Quotidian: May 2008

Saturday, May 31, 2008

Gibbs sampler for 1d Ising model

This depends on Matsumoto's Mersenne Twister.

gibbs1d.h:
#ifndef __GIBBS1D_H__
#define __GIBBS1D_H__

#include <vector>

/**
 * Produces a sequence of variates with P(x) propto exp(-beta * H(nodes))
 * as its stationary distribution, where H(nodes) = the number of consecutive
 * pairs with equal state.
 */
class Gibbs1d {
  
public:

  /**
   * beta -- larger values imply larger affinities for forming clusters.
   * numNodes -- number of points along the line.
   * q -- number of states for each point.
   */
  Gibbs1d(double beta, int numNodes, int q = 2);

  /**
   * Update the state of each node by Gibbs sampling.
   */
  void iterate();

  /**
   * Returns a pointer to a numNodes length array giving
   * the current state of each node.
   */
  const int* current() const;

private:
  //Hide the copy ctor and assignment operator
  Gibbs1d(const Gibbs1d&);
  Gibbs1d& operator=(const Gibbs1d&);

  const double beta;
  const int numNodes;
  const int q;
  std::vector<int> states;
  const double probNotEq1;
  const double probNotEq2;
  const double probEq;
  const double prob1Out;
};

#endif

gibbs1d.cpp:
#include <algorithm>
#include <cmath>
#include "gibbs1d.h"
#include "mtrandom.h"
using namespace std;

/**
 * beta -- larger values imply larger affinities for forming clusters.
 * numNodes -- number of points along the line.
 * q -- number of states for each point.
 */
Gibbs1d::Gibbs1d(double beta, int numNodes, int q)
  : beta(beta),
    numNodes(numNodes),
    q(q),
    states(numNodes + 2),
    probNotEq1(exp(beta) / (2.0 * exp(beta) + q - 2)),
    probNotEq2(2.0 * probNotEq1),
    probEq(exp(2.0 * beta) / (exp(2.0 * beta) + q - 1)),
    prob1Out(exp(beta) / (exp(beta) + q - 1))
{
  //Add sentinel values to the ends to avoid special casing
  //the boundaries.
  states[0] = states[numNodes + 1] = -1;
}

/**
 * Returns an integer in the interval [0, q-1]
 * != exclude, assigning equal probability to each
 * allowed value.
 * Assumes exclude is in [0, q-1].
 */
static int uniformNotEq(int q, int exclude)
{
  int r = (int) ((q - 1) * genrand_real2());
  if (r >= exclude) {
    return r + 1;
  }
  else {
    return r;
  }
}

/**
 * Returns an integer in the interval [0, q-1]
 * != exclude1 or exclude2, assigning equal probability to each
 * allowed value.
 * Assumes exclude1 and exclude2 are in [0, q-1]
 * and that exclude1 != exclude2.
 */
static int uniformNotEq(int q, int exclude1, int exclude2)
{
  if (exclude1 > exclude2) {
    swap(exclude1, exclude2);
  }

  int r = (int) ((q - 2) * genrand_real2());
  if (r >= exclude1) {
    r = r + 1;
  }
  
  if (r >= exclude2) {
    r = r + 1;
  }

  return r;
}

/**
 * Returns the given state with the given probability.
 * Otherwise, chooses one of the other q - 1 states
 * uniformly.
 */
static int withProbElseUniformNotEq(double prob, int state, int q)
{
  const double u = genrand_real2();
  if (u <= prob) {
    return state;
  }
  else {
    return uniformNotEq(q, state);
  }
}

void Gibbs1d::iterate()
{
  for (int i = 1; i <= numNodes; ++i) {
    const int left = states[i - 1];
    const int right = states[i + 1];

    if (left >= 0) {
      if (right >= 0) {
 if (left != right) {
   const double u = genrand_real2();
   if (u <= probNotEq1) {
     states[i] = left;
   }
   else if (u <= probNotEq2) {
     states[i] = right;
   }
   else {
     states[i] = uniformNotEq(q, left, right);
   }
 }
 else { // left == right
   states[i] = withProbElseUniformNotEq(probEq, left, q);
 }
      }
      else { //right < 0
 states[i] = withProbElseUniformNotEq(prob1Out, left, q);
      }
    }
    else {
      if (right >= 0) {
 states[i] = withProbElseUniformNotEq(prob1Out, right, q);
      }
      else {
 states[i] = (int) (q * genrand_real2());
      }
    }
  }
}

/**
 * Returns a pointer to a numNodes length array giving
 * the current state of each node.
 */
const int* Gibbs1d::current() const
{
  return &states[1];
}

I think there must be a much more elegant implementation of this. For one thing, the use of "sentinel" values isn't providing much of a payoff here. And the code for computing the probabilities is just very ugly. What's a better alternative to the nested if-statements?

//I really hate how Blogger cuts off the edges of wide elements. I suppose I can edit the style to fix this.

Friday, May 30, 2008

%e

From the Maxima mailing list today:
(%i4) load("simplify_sum")$ (%i5) simplify_sum(sum(1/n!,n,0,inf)); (%o5) %e

Someone had noted that sum(1/n!,n,0,inf),simpsum; wasn't evaluating to %e.

Someone else pointed out

(%i6) taylor(%e^n, n, 0, 6);
                            2    3    4    5     6
                           n    n    n    n     n
(%o6)/T/           1 + n + -- + -- + -- + --- + --- + . . .
                           2    6    24   120   720

What else does Maxima know?

(%i8) sum(k, k, 1, n), simpsum;
                                     2
                                    n  + n
(%o8)                               ------
                                      2
...
(%i10) simplify_sum(sum((-1)^(k+1) / k, k, 1, inf));
                          - 2 log(2) - %gamma   %gamma
(%o10)                  - ------------------- - ------
                                   2              2
(%i11) ratsimp(%);
(%o11)                              log(2)
(%i12) simplify_sum(sum(1/k^2, k, 1, inf));
                                        2
                                     %pi
(%o12)                               ----
                                      6
(%i13) simplify_sum(sum(1/k^3, k, 1, inf));
(%o13)                              zeta(3)
...
(%i17) simplify_sum(sum(4^(-k) * z^(2*k)/(k!)^2, k, 0, inf));
                                 inf
                                 ====    2 k
                                 \      z
(%o17)                            >    ------
                                 /      k   2
                                 ====  4  k!
                                 k = 0
... No Bessel functions?
(%i19) simplify_sum(sum(1/k^10, k, 1, inf));
                                        10
                                     %pi
(%o19)                               -----
                                     93555
(%i20) simplify_sum(sum(1/k^50, k, 1, inf));
                                                     50
                       39604576419286371856998202 %pi
(%o20)        ---------------------------------------------------
              285258771457546764463363635252374414183254365234375
...
(%i22) simplify_sum(sum(1/(k^2 + 1), k, 1, inf));
                       %i psi (1 - %i)   %i psi (%i + 1)
                             0                 0
(%o22)                 --------------- - ---------------
                              2                 2
... What is this?

Tuesday, May 27, 2008

Problem solving

I have enjoyed looking at Chris Chatfield's book "Problem solving: a statistician's guide." A few points from his summary section "How to be an effective statistician": "Look at the data... errors are inevitable when processing a large-scale data set, and steps must be taken to deal with them." [On the other hand, "large scale" often means "collected automatically", and the less human intervention, the less opportunity to screw it up.]
"Rather than asking 'What technique shall I use here?' it may be better to ask 'How can I summarize these data and understand them?'" "If an effect is 'clear', then the exact choice of analysis procedure may not be crucial." [Should we just be looking at testing for effects? Is there not an effect in the real world?] "A simple approach is often to be preferred to a complicated approach, as the former is easier for the 'client' to understand and is less likely to lead to serious blunders." [If something is not understandable, it's not verifiable. (But if it's not understandable, what good is it anyway?]

Articles by Chatfield referenced in the book:
Teaching a course in applied statistics
The initial examination of data
Model Uncertainty, Data Mining and Statistical Inference
Avoiding Statistical Pitfalls

Also referenced:
Teaching and Examining Applied Statistics by A.G. Hawkes

Leaking some Internal primes

I'm still just getting a feel for the C++ code implementing the algorithm described "Stochastic Graph Partition: Generalizing the Swendsen-Wang Method." (Author's post on LJ.) One thing I noticed right away though is that it depends on tr1! The readme mentioned it being compiled on a Mac using gcc 4.0.1, and I thought, "I have that! Do I already have TR1?!" A quick check with Spotlight revealed that it was indeed there. So, now here is some useless code depending on implementation details which shouldn't be used:

#include <algorithm>
#include <tr1/hashtable>
#include <iostream>
#include <iterator>

int main()
{
  Internal::X<0> x;
  std::copy(x.primes, x.primes + x.n_primes,
     std::ostream_iterator<unsigned long>(std::cout, "\n"));
}

Monday, May 26, 2008

compromise

Richard Morey provides a solution to the indentation wars (in R):

Forget indenting. I like to just make my code all one line, with commands separated by ";".

A sufficient condition for a mixture of equal variance normals to be unimodal:
abs(mu1 - mu2) <= 2 * sigma * sqrt(1 + abs(log(p) - log(1 - p)) / 2) -- Everitt and Hand, "Finite Mixture Distribution", referencing Behboodian 1970. [Is this the right article?] Does estimation of the parameters become much harder when the threshold from bimodal to unimodal is crossed? What are similar conditions for a mixture with k components? For a mixture of k Ricians?

> sigma = 1 > # p = 0.5 > mu <- rep(c(0, 1), c(5000, 5000)) > y <- rnorm(10000, mean=mu, sd=sigma) > hist(y) ##Appears to be unimodal > mu <- rep(c(0, 3), c(5000, 5000)) > y <- rnorm(10000, mean=mu, sd=sigma) > hist(y) ##Appears to be bimodal

> mu <- rep(c(0, 1), c(5000, 5000)) > y <- sqrt(rnorm(10000, mean=mu, sd=sigma)^2 + rnorm(10000, sd=sigma)^2) > hist(y, nclass=100) ##Appears to be unimodal > mu <- rep(c(0, 3), c(5000, 5000)) > y <- sqrt(rnorm(10000, mean=mu, sd=sigma)^2 + rnorm(10000, sd=sigma)^2) > hist(y, nclass=100) ##Appears to be bimodal

The same pattern shows up using plot(density(...)) in place of hist(...).

Monday, May 19, 2008

random partition

#include <assert.h>
#include <math.h>
#include "partition.h"
#include <Rmath.h>
#include <stdlib.h>

/**
 * Returns a array of length numClasses normalized to sum
 * to 1.
 */
static double* randomProportions(int numClasses)
{
  double* props = malloc(sizeof(double) * numClasses);
  int i;
  double total = 0.0;

  for (i = 0; i < numClasses; ++i) {
    props[i] = runif(0.0, 1.0);
    total += props[i];
  }

  for (i = 0; i < numClasses; ++i) {
    props[i] /= total;
  }
  
  return props;
}

/**
 * Ensures at least one element is assigned to each
 * class. The caller is responsible for freeing the
 * returned object.
 */
int* randomCounts(int len, int numClasses)
{
  int* counts = malloc(sizeof(int) * numClasses);
  double* props = randomProportions(numClasses);
  const double remaining = len - numClasses;
  int total = 0;
  int i;

  for (i = 0; i < numClasses; ++i) {
    counts[i] = 1;
  }
  
  for (i = 1; i < numClasses; ++i) {
    const int extra = (int) round(remaining * props[i]);
    total += extra;
    counts[i] += extra;
  }

  counts[0] += remaining - total;

  free(props);
  
  return counts;
}

/**
 * On exit, classes[i] = the class assigned to the kth class,
 * k = 0:(numClasses - 1). Assumes classes has length at least len.
 * Assumes that numClasses <= len. Assigns at least one element
 * to each class. The classes are assigned in blocks, ie, they
 * partition the indeces 0:(len - 1) into numClasses classes,
 * ordered from class 0 and up.
 */
void randomPartition(int len, int numClasses, int* classes)
{
  int* counts = randomCounts(len, numClasses);
  int i;
  int currentClass = 0;

  assert(numClasses <= len);

  for (i = 0; i < len; ++i) {
    classes[i] = currentClass;

    --counts[currentClass];

    if (!counts[currentClass]) {
      ++currentClass;
    }
  }

  free(counts);
}

I'm not completely happy with this design. I should probably have made "randomPartition" take the counts as an argument (although then it wouldn't've been so "random") since at least for my work, we're almost always interested in the counts (and will just have to recalculate them if they're not available). I wonder what standard implementations of this are available. I suppose I should be checking that the mallocs are succeeding as well. Are there any common platforms where accessing address 0 doesn't lead to a seg fault? Or what platforms don't segfault on accessing 0?

fast software development

How to Prototype a Game in Under 7 Days:
Tips and Tricks from 4 Grad Students Who Made Over 50 Games in 1 Semester. I've had this tab open for a while (weeks?). I think there are some valuable lessons here anyway. How should we test such ideas more formally? Something like the following?: Randomly split a pool of developers into two groups. Instruct group A to follow development methodology X for x days and group B to follow methodology Y for y days. Then after a rest period, instruct A to follow Y for y days and B to follow X for x days, finally releasing the products from every period simultaneously, taking steps to ensure equal exposure (a new random ordering on the public project page for each visit) and gathering the public's ratings of the products.

Saturday, May 17, 2008

a bug from using optim for 1d problem

Recently, a bug was described on r-devel, one introduced by using optim for a one-dimensional optimization problem (rather than optimize): HoltWinters fitted level parameter not bounded between 0 and 1. Looking at the source code for HoltWinters, I see the author used L-BFGS-B... so, maybe this is also an example of that method going out of bounds.

The R code leading to the bug:
... error <- function(p) hw(p, beta, gamma)$SSE sol <- optim(optim.start["alpha"], error, method = "L-BFGS-B", lower = 0, upper = 1, control = optim.control) alpha <- sol$par ...

Wednesday, May 14, 2008

Besag

I think I will learn a lot from this: Markov Chain Monte Carlo for Statistical Inference (2000) (2001) by Julian Besag

[R] books about MCMC to use MCMC R packages?

---

Computing the mean in R (2.6.0):

 case REALSXP:
     PROTECT(ans = allocVector(REALSXP, 1));
     for (i = 0; i < n; i++) s += REAL(x)[i];
     s /= n;
     if(R_FINITE((double)s)) {
  for (i = 0; i < n; i++) t += (REAL(x)[i] - s);
  s += t/n;
     }
     REAL(ans)[0] = s;
     break;

where t and s were previously initialized to 0.0. (Again I wonder if it would make a noticeable difference to cache the result of REAL(x).) So, in exact math, at the end of the loop t = sum(x) - sum(x) = 0. What does this accomplish in finite precision math?
> y <- seq(-1e6, 1e6, len=1000) > mean(y) [1] -1.433398e-11 > sum(y) / 1000 [1] -1.44355e-11
Hm.

And what does Recall (main/eval.c) look like on the inside?

SEXP attribute_hidden do_recall(SEXP call, SEXP op, SEXP args, SEXP rho)
{
    RCNTXT *cptr;
    SEXP s, ans ;
    cptr = R_GlobalContext;
    /* get the args supplied */
    while (cptr != NULL) {
 if (cptr->callflag == CTXT_RETURN && cptr->cloenv == rho)
     break;
 cptr = cptr->nextcontext;
    }
    args = cptr->promargs;
    /* get the env recall was called from */
    s = R_GlobalContext->sysparent;
    while (cptr != NULL) {
 if (cptr->callflag == CTXT_RETURN && cptr->cloenv == s)
     break;
 cptr = cptr->nextcontext;
    }
    if (cptr == NULL)
 error(_("'Recall' called from outside a closure"));

    /* If the function has been recorded in the context, use it
       otherwise search for it by name or evaluate the expression
       originally used to get it.
    */
    if (cptr->callfun != R_NilValue)
 PROTECT(s = cptr->callfun);
    else if( TYPEOF(CAR(cptr->call)) == SYMSXP)
 PROTECT(s = findFun(CAR(cptr->call), cptr->sysparent));
    else
 PROTECT(s = eval(CAR(cptr->call), cptr->sysparent));
    ans = applyClosure(cptr->call, s, args, cptr->sysparent, R_BaseEnv);
    UNPROTECT(1);
    return ans;
}

Renaming a function screws up a recursive function defined in terms of its own name:
> fact <- function(n) if (n > 1) n * fact(n - 1) else 1 > fact(3) [1] 6 > ffact <- fact > fact <- function(x) 0 ##Whoops! > ffact(3) [1] 0
But not one defined in terms of Recall:
> fact <- function(n) if (n > 1) n * Recall(n - 1) else 1 > fact(3) [1] 6 > ffact <- fact > fact <- function(x) 0 ##Whoops! > ffact(3) [1] 6
It occurs to me that I've never written a recursive function in R before. It looks like Recall gets a modest amount of use in the base code though.

Tuesday, May 13, 2008

JavaScript: Fill in fields from given data

Fill in input elements

I'm not happy with this code. I think it's much more verbose than it needs to be. More important though it's not flexible enough. And maybe most important, it doesn't allow you to undo its actions. I'm writing this with a view to make it easier to input data into WebCT. Not surprisingly, the current greasemonkey scripts aimed at sprucing up WebCT seem to be aimed at making life more pleasant just for students.

//I wish Blogger was a little smarter about '<' in javascript URLs -- or do some browsers barf on this as well?

//I'll have to look at this some more later: http://www.quirksmode.org/

//And don't worry -- I've also been thinking about the Ising and Potts models. I'm just about to check out Grimmett's book and will pursue an idea on how to generate Ising-distributed variates in a bit. However, I cannot believe that others would've missed this method before so I suspect I've missed something. I will certainly learn something from the attempt, however. I'm looking at Besag's papers for some details on pseudo-likelihood. From the other students, I learned it's covered in 601 so I will glance at Kaiser's notes for that.

Our code seems to be spinning along nicely. I've started a second run on real data and will do some quick runs on all the remaining real data sets tonight.

Monday, May 12, 2008

NULL

NULL is defined in stddef.h

To preserve capitalization of elements of a title in a LaTeX bibliography, eg, acryonyms, surround them in curly braces.

Generalized Linear Mixed Models, etc.

There has recently been a very interesting thread on generalized linear mixed models on R-sig-ME. Ken Beath writes, "For choosing the number of classes for mixture models it has been shown that BIC fails theoretically but works well in practice (proven with simulations) and compares well to results from parametric bootstrapping of the LRT." Where should one look for discussion of this?

Dimitris Rizopoulos writes, "Well, in Linear Mixed Models the integral over the random effects can be analytically evaluated and thus no approximation (i.e., AGQ or Laplace) is required. In GLMMs this is not the case and thus the log-likelihood needs to be calculated approximately. One method for approximating the integral is the AGQ, and in fact Laplace is AGQ with one quadrature point." Should a stat computing class cover all this?

Doug Bates writes, "That is, you gain more from using the Laplace approximation at the conditional modes of the random effects (which is the 'adaptive' part) than increasing the number of Gauss-Hermite quadrature points at the marginal modes. The tricky bit of AGQ or Laplace is determining the conditional modes and that part is already done." I'll carefully read through the source code for lme4 sometime soon I think. Doesn't this sound like fun?

Peter Dalgaard gives describes some issues related to generalized linear mixed models. The Laplace method is a trick from Bayesian methods based on the observation that the likelihood approaches a Gaussian -- "Approximate using value at peak and Hessian."

Some fun R functions:

> embed(1:5, dim=3)
     [,1] [,2] [,3]
[1,]    3    2    1
[2,]    4    3    2
[3,]    5    4    3
> embed(1:5, dim=2)
     [,1] [,2]
[1,]    2    1
[2,]    3    2
[3,]    4    3
[4,]    5    4

> as.integer(intToBits(as.integer(3))) [1] 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Using intToBits, we can write a function to print out the bits of a floating point number (ignoring exceptional cases such as Inf):

intToBitString <- function(x, bits=32) {
  substring(paste(rev(as.integer(intToBits(as.integer(x)))),
              collapse=""),
            32 - bits + 1, 32)
}

doubleToBitString <- function(x) {
  stopifnot(is.finite(x))
  e <- floor(log2(x))
  m <- (abs(x * 2 ^ -e) - 1)
  shift <- 8
  s.bits <- ""
  remaining <- .Machine$double.digits
  while (remaining > 0) {
    u <- min(remaining, shift)
    m <- m * 2 ^ u
    k <- floor(m)
    s.bits <- paste(s.bits, intToBitString(k, u), sep="")
    m <- m - k
    remaining <- remaining - u
  }
  
  mexp <- .Machine$double.exponent

  paste(if (x > 0) 0 else 1,
         intToBitString(e + 2^(mexp - 1) - 1, mexp),
         s.bits)
}

(We could easily avoid depending on the base being 2, but is it worth it?) The output for pi seems to match that given for pi on Wikipedia page.

> doubleToBitString(pi) [1] "0 10000000000 10010010000111111011010101000100010000101101000110000"
Here, just for kicks, is an implementation using the hack of dynamically creating a shared library to accomplish this task. (I wrote this a couple years ago in response to a class assignment to print out the bits of a floating point number in R. Well, the code I actually turned in was pure R -- and way longer.)

create.so <- function() {
  writeLines(c("void doubleToBitString(const double* x,"
               " int* length, char** result)",
               "{",
               "  const char* bytes = (const void*)x;",
               "  int numBytes = sizeof(double);",
               "  static char buffer[1024];",
               "  int i;",
               "  for (i = 0; i != numBytes; ++i) {",
               "    int j;",
               "    for (j = 0; j != 8; ++j) {",
               "      int mask = 1 << j;",
               "      buffer[8*i + j] = (mask & bytes[i]) ? \'1\' : \'0\';",
               "    }",
               "  }",
               "  buffer[8*numBytes] = \'\\0\';",
               "  *result = buffer;",
               "  *length = numBytes;",
               "}"),
             con = "bitstring.c")
  system("R CMD SHLIB bitstring.c", ignore.stderr=T)
}

doubleToBitString <- function(x) {
  if (!file.exists("bitstring.so")) {
    create.so()
  }
  dyn.load("bitstring.so")
  out <- .C("doubleToBitString",
            x=as.double(x),
            len=as.integer(1),
            bits=as.character('junk'))
  return(out$bits)
}

Whoa. Does this write into memory it shouldn't be touching?

Tuesday, May 6, 2008

C unit testing

I looked around at unit testing frameworks for C today. CppUnit caught my attention because it mentions being written by Michael Feathers. However, I'm working on code for someone who is not a fan of C++ so I decided it would be better to aim for a pure C solution. I'm trying out CuTest. It seems it will fulfill my needs. (Check sounded better to me and seems to have seen some love and care more recently, but the configure script failed to work on my computer, and I didn't want to devote much time to just getting the framework itself to compile.) CuTest seems all right so far. I wish the library functions made it bit easier to programmatically access the test results, however. For instance, I would like the program to exit with status 1 if any of the tests fail (or make some other externally obvious sign that doesn't require parsing the text output). (Would it be bad style to use exit status for this?)

frailty/random effects in survreg

library(survival) rsev <- function(n) log(-log(runif(n))) x <- rnorm(1000) v <- rep(1:100, each=10) q <- rep(rnorm(100, sd=2), each=10) y <- exp(1.0 + 3.0 * x + q + 5.0 * rsev(1000)) z <- rep(1, 1000) d <- data.frame(x=x, v=as.factor(v)) f <- survreg(Surv(y, z) ~ x + frailty.gaussian(v), data=d) f

Call:
survreg(formula = Surv(y, z) ~ x + frailty.gaussian(v), data = d)

                    coef se(coef) se2   Chisq DF   p      
(Intercept)         1.20 0.254    0.156  22.3  1.0 2.4e-06
x                   2.96 0.166    0.162 319.3  1.0 0.0e+00
frailty.gaussian(v)                     176.6 63.2 1.2e-12

Scale= 4.7 

Iterations: 7 outer, 23 Newton-Raphson
     Variance of random effect= 4 
Degrees of freedom for terms=  0.4  1.0 63.2  1.0 
Likelihood ratio test=463  on 63.5 df, p=0  n= 1000

Saturday, May 3, 2008

reading numbers from a file -- less buggy?

My previous readArray implementation contained a bug. I had implemented in terms of line-oriented input. However, this meant specifying a maximum line length (in characters). I thought 10000 characters was more than reasonable... well, it turns out my own code routinely prints out lines of numbers that are longer than that. (This is itself probably itself something that should change.) So, fgets was actually hitting this limit, while in the midst of reading in part of a number, splitting it into two parts, the next part showing up on the next fgets call, thus occasionally introducing two invalid numbers. If this hadn't introduced an extra number (and my code hadn't been checking it was getting the right number of numbers), I might have missed this.

One way to fix this would be to implement a function for line-oriented input that would guarantee the entire line had been read -- very close to the spirit of readArray. Instead though I've chosen to give up on line-oriented input and instead read things in a number at a time. The resulting code is a bit cleaner I think anyway (although there is still some duplication).

(Another fix would be to use a language for which reading a file full of a given data type is a feature of the standard library. I guess we almost have this with C++. Haskell must get this right, though, right?)

#include <stdlib.h>

/**
 * Reads doubles from in until EOF.
 * If le n is not null, *len is set to
 * the number of elements read.
 * The caller is responsible for freeing the 
 * memory of the returned object.
 * Returns null in case of an error.
 */
double* readArray(FILE* in, int* len)
{
  int capacity = 100;
  double* y = malloc(sizeof(double) * capacity);
  int n = 0;
  while (!ferror(in) && !feof(in)) {
    double v;
    int status = fscanf(in, "%lf", &v);
    if (status == 1) {
      if (capacity == n) {
 capacity *= 2;
 y = realloc(y, capacity * sizeof(double));
      }
      
      y[n] = v;
      ++n;
    }
    else if (status == 0) {
      fprintf(stderr,
              "# Invalid input? (read %d values so far)\n",
       n);
      free(y);
      return 0;
    }
    else if (status == EOF && !feof(in)) {
      fprintf(stderr,
             "# Got error (read %d values so far)\n", n);
      perror("");
      free(y);
      return 0;
    }
  }

  if (len) {
    *len = n;
  }

  return y;
}