Efficient calculation of var-covar matrix in R

@F. Privé's Rcpp implementation is a good starting place, but we can do better. You will notice in the main algorithm supplied by the OP that there are many replicated fairly expensive calculations. Observe:

OPalgo <- function(m, p, ind1, n) {
    vcov <- matrix(0, nrow = n + 1L, ncol = n + 1)
    for (i in 0L:n) {
        for (j in i:n) {
            ## lower and upper range for the first & second multiplicand
            print(paste(c((1L + (j - i)),":",(periods - i)," 
                          ",1L,":",(periods - j)), collapse = ""))

            vcov[j + 1L, i + 1L] <- 
                sum(mat[, (1L + (j - i)):(periods - i)] *
                                mat[, 1L:(periods - j)]) /
                                    (ind * (periods - j) - 1)
        }
    }
    vcov
}

OPalgo(mat, periods, ind, n_lags)
[1] "1:70 1:70"  ## contains "1:65 1:65"
[1] "2:70 1:69"
[1] "3:70 1:68"
[1] "4:70 1:67"
[1] "5:70 1:66"
[1] "6:70 1:65"
[1] "1:69 1:69"  ## contains "1:65 1:65"
[1] "2:69 1:68"
[1] "3:69 1:67"
[1] "4:69 1:66"
[1] "5:69 1:65"
[1] "1:68 1:68"  ## contains "1:65 1:65"
[1] "2:68 1:67"
[1] "3:68 1:66"
[1] "4:68 1:65"
[1] "1:67 1:67"  ## contains "1:65 1:65"
[1] "2:67 1:66"
[1] "3:67 1:65"
[1] "1:66 1:66"  ## contains "1:65 1:65"
[1] "2:66 1:65"
[1] "1:65 1:65"

As you can see, the product mat[,1:65] * mat[,1:65] is performed 6 times above. The only difference between the first occurrence and the last occurrence is that the first occurrence has an additional 5 columns. So instead of computing:

sum(mat[ , 1:70] * mat[ , 1:70])
sum(mat[ , 1:69] * mat[ , 1:69])
sum(mat[ , 1:68] * mat[ , 1:68])
sum(mat[ , 1:67] * mat[ , 1:67])
sum(mat[ , 1:66] * mat[ , 1:66])
sum(mat[ , 1:65] * mat[ , 1:65])

We can compute preCalc[1] <- sum(mat[ , 1:65] * mat[ , 1:65]) one time and use this in the other 5 calculations like so:

preCalc[1] + sum(mat[ , 66:70] * mat[ , 66:70])
preCalc[1] + sum(mat[ , 66:69] * mat[ , 66:69])
preCalc[1] + sum(mat[ , 66:68] * mat[ , 66:68])
preCalc[1] + sum(mat[ , 66:67] * mat[ , 66:67])
preCalc[1] + sum(mat[ , 66:66] * mat[ , 66:66])

In each of the above, we have reduce the number of multiplications by 90000 * 65 = 5,850,000 and the number of additions by 5,850,000 - 1 = 5,849,999 for a total of 11,699,999 arithmetic operations saved. The function below achieves this very thing.

fasterAlgo <- function(m, p, ind1, n) {
    vcov <- matrix(0, nrow = n + 1L, ncol = n + 1)
    preCals <- vapply(1:(n + 1L), function(x) sum(m[ , x:(p - n + x - 2L)] * 
                                               m[ , 1L:(p - n - 1L)]), 42.42)
    for (i in 0L:n) {
        for (j in i:n) {
            myNum <- preCals[1L + j - i] + sum(m[, (p - n + j - i):(p - i)] * m[, (p - n):(p - j)])
            vcov[j + 1L, i + 1L] <- myNum / (ind * (p - j) - 1)
        }
    }
    vcov
}

## outputs same results
all.equal(OPalgo(mat, periods, ind, n_lags), fasterAlgo(mat, periods, ind, n_lags))
[1] TRUE

Benchmarks:

## I commented out the print statements of the OPalgo before benchmarking
library(microbenchmark)
microbenchmark(OP = OPalgo(mat, periods, ind, n_lags),
               fasterBase = fasterAlgo(mat, periods, ind, n_lags),
               RcppOrig = compute_vcov(mat, n_lags), times = 5)
Unit: milliseconds
        expr       min        lq      mean   median        uq       max neval cld
          OP 2775.6110 2780.7207 2843.6012 2784.976 2899.7621 2976.9356     5   c
  fasterBase  863.3897  863.9681  865.5576  865.593  866.7962  868.0409     5  b 
    RcppOrig  160.1040  161.8922  162.0153  162.235  162.4756  163.3697     5 a  

As you can see, with this modification we see at least a 3 fold improvement but the Rcpp is still much faster. Let's implement the above concept in Rcpp.

// [[Rcpp::export]]
NumericMatrix compute_vcov2(const NumericMatrix& mat, int n_lags) {

    NumericMatrix vcov(n_lags + 1, n_lags + 1);
    std::vector<double> preCalcs;
    preCalcs.reserve(n_lags + 1);
    double myCov;

    int i, j, k1, k2, l;
    int n = mat.nrow();
    int m = mat.ncol();

    for (i = 0; i <= n_lags; i++) {
        myCov = 0;
        for (k1 = i, k2 = 0; k2 < (m - n_lags - 1); k1++, k2++) {
            for (l = 0; l < n; l++) {
                myCov += mat(l, k1) * mat(l, k2); 
            }
        }
        preCalcs.push_back(myCov);
    }

    for (i = 0; i <= n_lags; i++) {
        for (j = i; j <= n_lags; j++) {
            myCov = preCalcs[j - i];
            for (k1 = m - n_lags + j - i - 1, k2 = m - n_lags - 1; k2 < (m - j); k1++, k2++) {
                for (l = 0; l < n; l++) {
                    myCov += mat(l, k1) * mat(l, k2);
                }
            }
            myCov /= n * (m - j) - 1;
            vcov(i, j) = vcov(j, i) = myCov;
        }
    }

    return vcov;
}

## gives same results
all.equal(compute_vcov2(mat, n_lags), compute_vcov(mat, n_lags))
[1] TRUE

New benchmarks:

microbenchmark(OP = OPalgo(mat, periods, ind, n_lags),
               fasterBase = fasterAlgo(mat, periods, ind, n_lags),
               RcppOrig = compute_vcov(mat, n_lags), 
               RcppModified = compute_vcov2(mat, n_lags), times = 5)
Unit: milliseconds
        expr       min         lq       mean     median         uq        max neval  cld
          OP 2785.4789 2786.67683 2811.02528 2789.37719 2809.61270 2883.98073     5    d
  fasterBase  866.5601  868.25555  888.64418  869.31796  870.92308  968.16417     5   c 
    RcppOrig  160.3467  161.37992  162.74899  161.73009  164.38653  165.90174     5  b  
RcppModified   51.1641   51.67149   52.87447   52.56067   53.06273   55.91334     5 a   

Now the enhanced Rcpp solution is around 3x faster the original Rcpp solution and around 50x faster than the original algorithm provided by the OP.

Update

We can do even better. We can reverse the ranges of the indices i/j so as to continuously update preCalcs. This allows up to only compute the product of one new column every iteration. This really comes into play as n_lags increases. Observe:

// [[Rcpp::export]]
NumericMatrix compute_vcov3(const NumericMatrix& mat, int n_lags) {

    NumericMatrix vcov(n_lags + 1, n_lags + 1);
    std::vector<double> preCalcs;
    preCalcs.reserve(n_lags + 1);

    int i, j, k1, k2, l;
    int n = mat.nrow();
    int m = mat.ncol();

    for (i = 0; i <= n_lags; i++) {
        preCalcs.push_back(0);
        for (k1 = i, k2 = 0; k2 < (m - n_lags); k1++, k2++) {
            for (l = 0; l < n; l++) {
                preCalcs[i] += mat(l, k1) * mat(l, k2); 
            }
        }
    }

    for (i = n_lags; i >= 0; i--) {  ## reverse range
        for (j = n_lags; j >= i; j--) {   ## reverse range
            vcov(i, j) = vcov(j, i) = preCalcs[j - i] / (n * (m - j) - 1);
            if (i > 0 && i > 0) {
                for (k1 = m - i, k2 = m - j; k2 <= (m - j); k1++, k2++) {
                    for (l = 0; l < n; l++) {
                        ## updating preCalcs vector
                        preCalcs[j - i] += mat(l, k1) * mat(l, k2);  
                    }
                }
            }
        }
    }

    return vcov;
}

all.equal(compute_vcov(mat, n_lags), compute_vcov3(mat, n_lags))
[1] TRUE

Rcpp benchmarks only:

n_lags <- 50L
microbenchmark(RcppOrig = compute_vcov(mat, n_lags),
                 RcppModified = compute_vcov2(mat, n_lags),
                 RcppExtreme = compute_vcov3(mat, n_lags), times = 5)
Unit: milliseconds
        expr       min        lq      mean    median       uq       max neval cld
    RcppOrig 7035.7920 7069.7761 7083.4961 7070.3395 7119.028 7122.5446     5   c
RcppModified 3608.8986 3645.8585 3653.0029 3654.7209 3663.716 3691.8202     5  b 
 RcppExtreme  324.8252  330.7381  332.9657  333.5919  335.168  340.5054     5 a  

The newest implementation is now over 20x faster than the original Rcpp version and well over 300x faster than the original algorithm when n-lags is large.


Just translating your code in Rcpp:

#include <Rcpp.h>
using namespace Rcpp;    

// [[Rcpp::export]]
NumericMatrix compute_vcov(const NumericMatrix& mat, int n_lags) {

  NumericMatrix vcov(n_lags + 1, n_lags + 1);
  double myCov;

  int i, j, k1, k2, l;
  int n = mat.nrow();
  int m = mat.ncol();

  for (i = 0; i <= n_lags; i++) {
    for (j = i; j <= n_lags; j++) {
      myCov = 0;
      for (k1 = j - i, k2 = 0; k2 < (m - j); k1++, k2++) {
        for (l = 0; l < n; l++) {
          myCov += mat(l, k1) * mat(l, k2); 
        }
      }
      myCov /= n * (m - j) - 1;
      vcov(i, j) = vcov(j, i) = myCov;
    }
  }

  return vcov;
}

This is at least 10 times as fast as the R algorithm. Yet, I feel like it could be optimized further.