data.table中的重塑难题

时间:2021-08-09 20:11:30

Yet another reshape problem in data.table

data.table中的另一个重塑问题

set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))
#    x y  v
# 1: 1 A 12
# 2: 1 B 62
...
#11: 3 A 63
#12: 3 B 49

I would like to do a cumulative sum of x and v by y but the result to be presented as: The number of lines always stays the same, and when y==A the SUM.*.A is incremented, same when y==B. (As usual y could have many factors, 2 in this example)

我想用y做一个x和v的累加和,但结果表示为:行数始终保持不变,当y == A时,SUM。*。A递增,当y =时= B。 (像往常一样,y可能有很多因素,在本例中为2)

#     SUM.x.A SUM.x.B  SUM.v.A SUM.v.B
# 1:        1      NA       12      NA
# 2:        1       1       12      62
...
#11:       12       9       318     289
#12:       12      12       318     338

EDIT: Here is my poor solution clearly overly complicated

编辑:这是我糟糕的解决方案显然过于复杂

#first step is to create cumsum columns
colNames <- c("x","v"); newColNames <- paste0("SUM.",colNames)
DT[, newColNames:=lapply(.SD,cumsum) ,by=y, .SDcols=colNames, with=F];
#now we need to reshape each SUM.* to SUM.*.{yvalue}
DT[,N:=.I]; setattr(DT,"sorted","N")

g <- function(DT,SD){
  cols <- c('N',grep('SUM',colnames(SD), value=T));
  Yval <- unique(SD[,y]);
  merge(DT, SD[,cols, with=F], suffixe=c('',paste0('.',Yval)), all.x=T);    
}

DT <- Reduce(f=g,init=DT,x=split(DT,DT$y));

locf = function(x) {
  ind = which(!is.na(x))    
  if(is.na(x[1])) ind = c(1,ind)
  rep(x[ind], times = diff( c(ind, length(x) + 1) )) 
}

newColNames <- grep('SUM',colnames(DT),value=T);
DT <- DT[, (newColNames):=lapply(.SD, locf), .SDcols=newColNames]

3 个解决方案

#1


5  

Try this:

尝试这个:

cumsum0 <- function(x) { x <- cumsum(x); ifelse(x == 0, NA, x) }
DT2 <- DT[, {SUM.<-y; lapply(data.table(model.matrix(~ SUM.:x + SUM.:v + 0)), cumsum0)}]
setnames(DT2, sub("(.):(.)", "\\2.\\1", names(DT2)))

Simplifications:

简化:

1) If using 0 in place of NA is ok then it can be simplified by omitting the first line which defines cumsum0 and replacing cumsum0 in the next line with cumsum.

1)如果使用0代替NA是可以的,那么可以通过省略定义cumsum0的第一行并用cumsum替换下一行中的cumsum0来简化它。

2) The result of the second line has these names:

2)第二行的结果具有以下名称:

> names(DT2)
[1] "SUM.A:x" "SUM.B:x" "SUM.A:v" "SUM.B:v"

so if that is sufficient the last line can be dropped since its only purpose is to make the names exactly the same as in the question.

因此,如果这足够,则可以删除最后一行,因为它的唯一目的是使名称与问题中的名称完全相同。

The result (without the simplifications) is:

结果(没有简化)是:

> DT2
    SUM.x.A SUM.x.B SUM.v.A SUM.v.B
 1:       1      NA      12      NA
 2:       1       1      12      62
 3:       2       1      72      62
 4:       2       2      72     123
 5:       4       2     155     123
 6:       4       4     155     220
 7:       6       4     156     220
 8:       6       6     156     242
 9:       9       6     255     242
10:       9       9     255     289
11:      12       9     318     289
12:      12      12     318     338

#2


4  

Here's another way:

这是另一种方式:

ys <- unique(DT$y)
sdcols <- c("x", "v")
cols <- paste0("SUM.", sdcols)
DT[, c(cols) := lapply(.SD, cumsum), by = y, .SDcols = sdcols]
for( i in seq_along(ys)) {
    cols <- paste0("SUM.", sdcols, ".", ys[i])
    DT[, c("v1", "v2") := list(SUM.x, SUM.v[i]), by = SUM.x]
    DT[, c("v1", "v2") := list(c(rep(NA_integer_, (i-1)), v1)[seq_len(.N)], 
    c(rep(NA_integer_, (i-1)), v2)[seq_len(.N)])]
    setnames(DT, c("v1", "v2"), cols)
}

My version of benchmarking with mnel's (from his post) and this function:

我使用mnel(来自他的帖子)和这个函数的基准测试版本:

The function from this post:

arun <- function(DT) {

    ys <- unique(DT$y)
    sdcols <- c("x", "v")
    cols <- paste0("SUM.", sdcols)
    DT[, c(cols) := lapply(.SD, cumsum), by = y, .SDcols = sdcols]
    for( i in seq_along(ys)) {
        cols <- paste0("SUM.", sdcols, ".", ys[i])
        DT[, c("v1", "v2") := list(SUM.x, SUM.v[i]), by = SUM.x]
        DT[, c("v1", "v2") := list(c(rep(NA_integer_, (i-1)), v1)[seq_len(.N)], 
        c(rep(NA_integer_, (i-1)), v2)[seq_len(.N)])]
        setnames(DT, c("v1", "v2"), cols)
    }
    DT
}

Function from mnel's post:

mnel <- function(DT) {
    set.seed(1234)
    DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))
    DT[, id := seq_len(nrow(DT))]
    setkey(DT, y)
    uniqY <- unique(DT$y)
    for(jj in uniqY){
      nc <- do.call(paste, c(expand.grid('Sum', c('x','v'),jj), sep ='.'))
      DT[.(jj), (nc) := list(cumsum(x), cumsum(v))]

    }
    setkey(DT, id)
    DT[, 5:8 := lapply(.SD, function(x) { 
      xn <- is.na(x)
      x[xn] <- -Inf
      xx <- cummax(x)
      # deal with leading NA values
        if(xn[1]){
        xn1 <- which(xn)[1]
      xx[seq_len(xn1)] <- NA}   
      xx }), .SDcols = 5:8]
}

Function from statquant:

statquant <- function(DT){
    #first step is to create cumsum columns
    colNames <- c("x","v"); newColNames <- paste0("SUM.",colNames)
    DT[, newColNames:=lapply(.SD,cumsum) ,by=y, .SDcols=colNames, with=F];
    #now we need to reshape each SUM.* to SUM.*.{yvalue}
    DT[,N:=.I]; setattr(DT,"sorted","N")

    g <- function(DT,SD){
      cols <- c('N',grep('SUM',colnames(SD), value=T));
      Yval <- unique(SD[,y]);
      merge(DT, SD[,cols, with=F], suffixe=c('',paste0('.',Yval)), all.x=T);    
    }

    DT <- Reduce(f=g,init=DT,x=split(DT,DT$y));

    locf = function(x) {
      ind = which(!is.na(x))    
      if(is.na(x[1])) ind = c(1,ind)
      rep(x[ind], times = diff( c(ind, length(x) + 1) )) 
    }

    newColNames <- grep('SUM',colnames(DT),value=T);
    DT <- DT[, (newColNames):=lapply(.SD, locf), .SDcols=newColNames]
    DT
}

Function from grothendieck

grothendieck <- function(DT) {
    cumsum0 <- function(x) { x <- cumsum(x); ifelse(x == 0, NA, x) }
    DT2 <- DT[, {SUM.<-y; lapply(data.table(model.matrix(~ SUM.:x + SUM.:v + 0)), cumsum0)}]
    setnames(DT2, sub("(.):(.)", "\\2.\\1", names(DT2)))
    DT2
}

Benchmarking:

library(data.table)
library(zoo)
set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))

library(microbenchmark)
microbenchmark( s <- statquant(copy(DT)), g <- grothendieck(copy(DT)), 
                m <- mnel(copy(DT)), a <- arun(copy(DT)), times = 1e3)

# Unit: milliseconds
#                         expr       min        lq    median        uq       max neval
#     s <- statquant(copy(DT)) 13.041125 13.674083 14.493870 17.273151 144.74186  1000
#  g <- grothendieck(copy(DT))  3.634120  3.859143  4.006085  4.443388  80.01984  1000
#          m <- mnel(copy(DT))  7.819286  8.234178  8.596090 10.423668  87.07668  1000
#          a <- arun(copy(DT))  6.925419  7.369286  7.703003  9.262719  53.39823  1000

resulting data.table "a" (arun's)

#     x y  v SUM.x SUM.v SUM.x.A SUM.v.A SUM.x.B SUM.v.B
#  1: 1 A 12     1    12       1      12      NA      NA
#  2: 1 B 62     1    62       1      12       1      62
#  3: 1 A 60     2    72       2      72       1      62
#  4: 1 B 61     2   123       2      72       2     123
#  5: 2 A 83     4   155       4     155       2     123
#  6: 2 B 97     4   220       4     155       4     220
#  7: 2 A  1     6   156       6     156       4     220
#  8: 2 B 22     6   242       6     156       6     242
#  9: 3 A 99     9   255       9     255       6     242
# 10: 3 B 47     9   289       9     255       9     289
# 11: 3 A 63    12   318      12     318       9     289
# 12: 3 B 49    12   338      12     318      12     338

Resulting data.table "m" (mnel's)

#    x y  v id Sum.x.A Sum.v.A Sum.x.B Sum.v.B
#  1: 1 A 12  1       1      12      NA      NA
#  2: 1 B 62  2       1      12       1      62
#  3: 1 A 60  3       2      72       1      62
#  4: 1 B 61  4       2      72       2     123
#  5: 2 A 83  5       4     155       2     123
#  6: 2 B 97  6       4     155       4     220
#  7: 2 A  1  7       6     156       4     220
#  8: 2 B 22  8       6     156       6     242
#  9: 3 A 99  9       9     255       6     242
# 10: 3 B 47 10       9     255       9     289
# 11: 3 A 63 11      12     318       9     289
# 12: 3 B 49 12      12     318      12     338

Resulting data.table "s" (statquant's)

#      N x y  v SUM.x SUM.v SUM.x.A SUM.v.A SUM.x.B SUM.v.B
#  1:  1 1 A 12     1    12       1      12      NA      NA
#  2:  2 1 B 62     1    62       1      12       1      62
#  3:  3 1 A 60     2    72       2      72       1      62
#  4:  4 1 B 61     2   123       2      72       2     123
#  5:  5 2 A 83     4   155       4     155       2     123
#  6:  6 2 B 97     4   220       4     155       4     220
#  7:  7 2 A  1     6   156       6     156       4     220
#  8:  8 2 B 22     6   242       6     156       6     242
#  9:  9 3 A 99     9   255       9     255       6     242
# 10: 10 3 B 47     9   289       9     255       9     289
# 11: 11 3 A 63    12   318      12     318       9     289
# 12: 12 3 B 49    12   338      12     318      12     338

Resulting data.table "g" (grothendieck's)

#    SUM.x.A SUM.x.B SUM.v.A SUM.v.B
#  1:       1      NA      12      NA
#  2:       1       1      12      62
#  3:       2       1      72      62
#  4:       2       2      72     123
#  5:       4       2     155     123
#  6:       4       4     155     220
#  7:       6       4     156     220
#  8:       6       6     156     242
#  9:       9       6     255     242
# 10:       9       9     255     289
# 11:      12       9     318     289
# 12:      12      12     318     338

#3


3  

Not sure this is the best solution, but you could do something like the following.

不确定这是最好的解决方案,但您可以执行以下操作。

set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))
DT[, id := seq_len(nrow(DT))]

setkey(DT, y)

uniqY <- unique(DT$y)

for(jj in uniqY){
  nc <- do.call(paste, c(expand.grid('Sum', c('x','v'),jj), sep ='.'))
  DT[.(jj), (nc) := list(cumsum(x), cumsum(v))]

}

setkey(DT, id)

DT[, 5:8 := lapply(.SD, function(x) { 
  xn <- is.na(x)
  x[xn] <- -Inf
  xx <- cummax(x)
  # deal with leading NA values
    if(xn[1]){
    xn1 <- which(xn)[1]
  xx[seq_len(xn1)] <- NA}   

  xx }), .SDcols = 5:8]

#1


5  

Try this:

尝试这个:

cumsum0 <- function(x) { x <- cumsum(x); ifelse(x == 0, NA, x) }
DT2 <- DT[, {SUM.<-y; lapply(data.table(model.matrix(~ SUM.:x + SUM.:v + 0)), cumsum0)}]
setnames(DT2, sub("(.):(.)", "\\2.\\1", names(DT2)))

Simplifications:

简化:

1) If using 0 in place of NA is ok then it can be simplified by omitting the first line which defines cumsum0 and replacing cumsum0 in the next line with cumsum.

1)如果使用0代替NA是可以的,那么可以通过省略定义cumsum0的第一行并用cumsum替换下一行中的cumsum0来简化它。

2) The result of the second line has these names:

2)第二行的结果具有以下名称:

> names(DT2)
[1] "SUM.A:x" "SUM.B:x" "SUM.A:v" "SUM.B:v"

so if that is sufficient the last line can be dropped since its only purpose is to make the names exactly the same as in the question.

因此,如果这足够,则可以删除最后一行,因为它的唯一目的是使名称与问题中的名称完全相同。

The result (without the simplifications) is:

结果(没有简化)是:

> DT2
    SUM.x.A SUM.x.B SUM.v.A SUM.v.B
 1:       1      NA      12      NA
 2:       1       1      12      62
 3:       2       1      72      62
 4:       2       2      72     123
 5:       4       2     155     123
 6:       4       4     155     220
 7:       6       4     156     220
 8:       6       6     156     242
 9:       9       6     255     242
10:       9       9     255     289
11:      12       9     318     289
12:      12      12     318     338

#2


4  

Here's another way:

这是另一种方式:

ys <- unique(DT$y)
sdcols <- c("x", "v")
cols <- paste0("SUM.", sdcols)
DT[, c(cols) := lapply(.SD, cumsum), by = y, .SDcols = sdcols]
for( i in seq_along(ys)) {
    cols <- paste0("SUM.", sdcols, ".", ys[i])
    DT[, c("v1", "v2") := list(SUM.x, SUM.v[i]), by = SUM.x]
    DT[, c("v1", "v2") := list(c(rep(NA_integer_, (i-1)), v1)[seq_len(.N)], 
    c(rep(NA_integer_, (i-1)), v2)[seq_len(.N)])]
    setnames(DT, c("v1", "v2"), cols)
}

My version of benchmarking with mnel's (from his post) and this function:

我使用mnel(来自他的帖子)和这个函数的基准测试版本:

The function from this post:

arun <- function(DT) {

    ys <- unique(DT$y)
    sdcols <- c("x", "v")
    cols <- paste0("SUM.", sdcols)
    DT[, c(cols) := lapply(.SD, cumsum), by = y, .SDcols = sdcols]
    for( i in seq_along(ys)) {
        cols <- paste0("SUM.", sdcols, ".", ys[i])
        DT[, c("v1", "v2") := list(SUM.x, SUM.v[i]), by = SUM.x]
        DT[, c("v1", "v2") := list(c(rep(NA_integer_, (i-1)), v1)[seq_len(.N)], 
        c(rep(NA_integer_, (i-1)), v2)[seq_len(.N)])]
        setnames(DT, c("v1", "v2"), cols)
    }
    DT
}

Function from mnel's post:

mnel <- function(DT) {
    set.seed(1234)
    DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))
    DT[, id := seq_len(nrow(DT))]
    setkey(DT, y)
    uniqY <- unique(DT$y)
    for(jj in uniqY){
      nc <- do.call(paste, c(expand.grid('Sum', c('x','v'),jj), sep ='.'))
      DT[.(jj), (nc) := list(cumsum(x), cumsum(v))]

    }
    setkey(DT, id)
    DT[, 5:8 := lapply(.SD, function(x) { 
      xn <- is.na(x)
      x[xn] <- -Inf
      xx <- cummax(x)
      # deal with leading NA values
        if(xn[1]){
        xn1 <- which(xn)[1]
      xx[seq_len(xn1)] <- NA}   
      xx }), .SDcols = 5:8]
}

Function from statquant:

statquant <- function(DT){
    #first step is to create cumsum columns
    colNames <- c("x","v"); newColNames <- paste0("SUM.",colNames)
    DT[, newColNames:=lapply(.SD,cumsum) ,by=y, .SDcols=colNames, with=F];
    #now we need to reshape each SUM.* to SUM.*.{yvalue}
    DT[,N:=.I]; setattr(DT,"sorted","N")

    g <- function(DT,SD){
      cols <- c('N',grep('SUM',colnames(SD), value=T));
      Yval <- unique(SD[,y]);
      merge(DT, SD[,cols, with=F], suffixe=c('',paste0('.',Yval)), all.x=T);    
    }

    DT <- Reduce(f=g,init=DT,x=split(DT,DT$y));

    locf = function(x) {
      ind = which(!is.na(x))    
      if(is.na(x[1])) ind = c(1,ind)
      rep(x[ind], times = diff( c(ind, length(x) + 1) )) 
    }

    newColNames <- grep('SUM',colnames(DT),value=T);
    DT <- DT[, (newColNames):=lapply(.SD, locf), .SDcols=newColNames]
    DT
}

Function from grothendieck

grothendieck <- function(DT) {
    cumsum0 <- function(x) { x <- cumsum(x); ifelse(x == 0, NA, x) }
    DT2 <- DT[, {SUM.<-y; lapply(data.table(model.matrix(~ SUM.:x + SUM.:v + 0)), cumsum0)}]
    setnames(DT2, sub("(.):(.)", "\\2.\\1", names(DT2)))
    DT2
}

Benchmarking:

library(data.table)
library(zoo)
set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))

library(microbenchmark)
microbenchmark( s <- statquant(copy(DT)), g <- grothendieck(copy(DT)), 
                m <- mnel(copy(DT)), a <- arun(copy(DT)), times = 1e3)

# Unit: milliseconds
#                         expr       min        lq    median        uq       max neval
#     s <- statquant(copy(DT)) 13.041125 13.674083 14.493870 17.273151 144.74186  1000
#  g <- grothendieck(copy(DT))  3.634120  3.859143  4.006085  4.443388  80.01984  1000
#          m <- mnel(copy(DT))  7.819286  8.234178  8.596090 10.423668  87.07668  1000
#          a <- arun(copy(DT))  6.925419  7.369286  7.703003  9.262719  53.39823  1000

resulting data.table "a" (arun's)

#     x y  v SUM.x SUM.v SUM.x.A SUM.v.A SUM.x.B SUM.v.B
#  1: 1 A 12     1    12       1      12      NA      NA
#  2: 1 B 62     1    62       1      12       1      62
#  3: 1 A 60     2    72       2      72       1      62
#  4: 1 B 61     2   123       2      72       2     123
#  5: 2 A 83     4   155       4     155       2     123
#  6: 2 B 97     4   220       4     155       4     220
#  7: 2 A  1     6   156       6     156       4     220
#  8: 2 B 22     6   242       6     156       6     242
#  9: 3 A 99     9   255       9     255       6     242
# 10: 3 B 47     9   289       9     255       9     289
# 11: 3 A 63    12   318      12     318       9     289
# 12: 3 B 49    12   338      12     318      12     338

Resulting data.table "m" (mnel's)

#    x y  v id Sum.x.A Sum.v.A Sum.x.B Sum.v.B
#  1: 1 A 12  1       1      12      NA      NA
#  2: 1 B 62  2       1      12       1      62
#  3: 1 A 60  3       2      72       1      62
#  4: 1 B 61  4       2      72       2     123
#  5: 2 A 83  5       4     155       2     123
#  6: 2 B 97  6       4     155       4     220
#  7: 2 A  1  7       6     156       4     220
#  8: 2 B 22  8       6     156       6     242
#  9: 3 A 99  9       9     255       6     242
# 10: 3 B 47 10       9     255       9     289
# 11: 3 A 63 11      12     318       9     289
# 12: 3 B 49 12      12     318      12     338

Resulting data.table "s" (statquant's)

#      N x y  v SUM.x SUM.v SUM.x.A SUM.v.A SUM.x.B SUM.v.B
#  1:  1 1 A 12     1    12       1      12      NA      NA
#  2:  2 1 B 62     1    62       1      12       1      62
#  3:  3 1 A 60     2    72       2      72       1      62
#  4:  4 1 B 61     2   123       2      72       2     123
#  5:  5 2 A 83     4   155       4     155       2     123
#  6:  6 2 B 97     4   220       4     155       4     220
#  7:  7 2 A  1     6   156       6     156       4     220
#  8:  8 2 B 22     6   242       6     156       6     242
#  9:  9 3 A 99     9   255       9     255       6     242
# 10: 10 3 B 47     9   289       9     255       9     289
# 11: 11 3 A 63    12   318      12     318       9     289
# 12: 12 3 B 49    12   338      12     318      12     338

Resulting data.table "g" (grothendieck's)

#    SUM.x.A SUM.x.B SUM.v.A SUM.v.B
#  1:       1      NA      12      NA
#  2:       1       1      12      62
#  3:       2       1      72      62
#  4:       2       2      72     123
#  5:       4       2     155     123
#  6:       4       4     155     220
#  7:       6       4     156     220
#  8:       6       6     156     242
#  9:       9       6     255     242
# 10:       9       9     255     289
# 11:      12       9     318     289
# 12:      12      12     318     338

#3


3  

Not sure this is the best solution, but you could do something like the following.

不确定这是最好的解决方案,但您可以执行以下操作。

set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))
DT[, id := seq_len(nrow(DT))]

setkey(DT, y)

uniqY <- unique(DT$y)

for(jj in uniqY){
  nc <- do.call(paste, c(expand.grid('Sum', c('x','v'),jj), sep ='.'))
  DT[.(jj), (nc) := list(cumsum(x), cumsum(v))]

}

setkey(DT, id)

DT[, 5:8 := lapply(.SD, function(x) { 
  xn <- is.na(x)
  x[xn] <- -Inf
  xx <- cummax(x)
  # deal with leading NA values
    if(xn[1]){
    xn1 <- which(xn)[1]
  xx[seq_len(xn1)] <- NA}   

  xx }), .SDcols = 5:8]