How to aggregate a data.frame on both row and column names based on a hierarchical dictionary name structure?

Continuing from df[df > 0] <- 1

library(reshape)
library(reshape2)
library(data.table)

# Word-by-word co-occurrence counts: the 0/1 matrix times its own transpose.
# keep.rownames = TRUE (as suggested by @RicardoSaporta) carries the row names
# of the product over into a regular column called "rn".
dt <- data.table(
  as.matrix(df) %*% t(as.matrix(df)),
  keep.rownames = TRUE
)

# Go from the square matrix layout to long format: one row per
# (rn, variable) word pair with its count in "value". Print dt to see it.
dt <- melt(dt, id.vars = "rn")

# Attach the level1 category (positive/negative) of each word of the pair.
# First the row word ("rn"), then the column word ("variable"); the suffixes
# keep the two level1 columns apart as level1_1 and level1_2.
dt <- merge(
  x = dt, y = dictionary,
  by.x = "rn", by.y = "level2",
  all.x = TRUE
)
dt <- merge(
  x = dt, y = dictionary,
  by.x = "variable", by.y = "level2",
  all.x = TRUE,
  suffixes = c("_1", "_2")
)

# Total count for every category / category combination, in long format.
dt <- data.table(dt)
dt[, list(value = sum(value)), by = list(level1_1, level1_2)]

# The same totals laid out as a category-by-category table.
cast(data = dt, formula = level1_1 ~ level1_2, fun.aggregate = sum)

Output

> cast(dt,level1_1~level1_2, fun.aggregate=sum)
  level1_1 Negative Positive
1 Negative       48       42
2 Positive       42       45

Basically the same solution as the other two so far, just a bit more compact and probably a bit faster:

library(reshape2)
library(data.table)

# Long format of m: one row per (Var1, Var2) pair with its count in "value".
# NOTE(review): m is defined outside this snippet; presumably it is the
# co-occurrence matrix with words as dimnames - confirm.
# reshape2::melt is spelled out because data.table exports a melt() of its own
# that is meant for data.tables, not matrices.
mdt <- data.table(reshape2::melt(m), key = "Var1")
dic <- data.table(dictionary, key = "level2")

# Look up the level1 category of both words of every pair. match() is used
# instead of chaining two dic[...] joins so that the code does not depend on
# the name data.table gives to the duplicated level1 column (it has been
# "level1.1" or "i.level1" depending on the version). Words missing from the
# dictionary get NA, as they would with the join.
mdt[, level1 := dic$level1[match(Var1, dic$level2)]]
mdt[, level1.1 := dic$level1[match(Var2, dic$level2)]]

# Sum the counts per category pair; value.var is given explicitly so dcast
# does not have to guess which column holds the values.
dcast(mdt, level1 ~ level1.1, value.var = "value", fun.aggregate = sum)
#    level1 Negative Positive
#1 Negative       48       42
#2 Positive       42       45

You could go back a step, doing the aggregation on the adjacency matrix before creating the co-occurrence matrix:

# Dictionary keyed on level2 so that dict[rn] looks each word up by name.
dict <- data.table(dictionary, key = "level2")
# Adjacency data as a data.table; keep.rownames = TRUE stores the row names
# of df in a column called "rn".
adj2 <- data.table(df, keep.rownames = TRUE)

# Collapse the word rows into one row per level1 category by summing every
# data column. The grouping column is named explicitly instead of relying on
# the name data.table would pick for an unnamed by-expression, and .SDcols
# states outright that the character "rn" column is not to be summed.
adj1 <- adj2[,
  lapply(.SD, sum),
  by = .(level1 = dict[rn]$level1),
  .SDcols = setdiff(names(adj2), "rn")
]

# one tedious step: move the category column into the row names of a matrix.
# The column is dropped by name (with = FALSE) rather than by position.
adj1mat           <- as.matrix(adj1[, !"level1", with = FALSE])
rownames(adj1mat) <- as.character(adj1$level1)

# Category-by-category co-occurrence counts.
m1   <- adj1mat %*% t(adj1mat)

#          Positive Negative
# Positive       45       42
# Negative       42       48

It will make sense to have your dictionary stored as a keyed data.table anyway, I expect.

Tags:

R

Data.Table