Select the first row by group

You can use duplicated to do this very quickly.

test[!duplicated(test$id),]

Benchmarks, for the speed freaks:

ju <- function() test[!duplicated(test$id),]
gs1 <- function() do.call(rbind, lapply(split(test, test$id), head, 1))
gs2 <- function() do.call(rbind, lapply(split(test, test$id), `[`, 1, ))
jply <- function() ddply(test,.(id),function(x) head(x,1))
jdt <- function() {
  testd <- as.data.table(test)
  setkey(testd,id)
  # Initial solution (slow)
  # testd[,lapply(.SD,function(x) head(x,1)),by = key(testd)]
  # Faster options :
  testd[!duplicated(id)]               # (1)
  # testd[, .SD[1L], by=key(testd)]    # (2)
  # testd[J(unique(id)),mult="first"]  # (3)
  # testd[ testd[,.I[1L],by=id] ]      # (4) needs v1.8.3. Allows 2nd, 3rd etc
}

library(plyr)
library(data.table)
library(rbenchmark)

# sample data
set.seed(21)
test <- data.frame(id=sample(1e3, 1e5, TRUE), string=sample(LETTERS, 1e5, TRUE))
test <- test[order(test$id), ]

benchmark(ju(), gs1(), gs2(), jply(), jdt(),
    replications=5, order="relative")[,1:6]
#     test replications elapsed relative user.self sys.self
# 1   ju()            5    0.03    1.000      0.03     0.00
# 5  jdt()            5    0.03    1.000      0.03     0.00
# 3  gs2()            5    3.49  116.333      2.87     0.58
# 2  gs1()            5    3.58  119.333      3.00     0.58
# 4 jply()            5    3.69  123.000      3.11     0.51

Let's try that again, but with just the contenders from the first heat and with more data and more replications.

set.seed(21)
test <- data.frame(id=sample(1e4, 1e6, TRUE), string=sample(LETTERS, 1e6, TRUE))
test <- test[order(test$id), ]
benchmark(ju(), jdt(), order="relative")[,1:6]
#    test replications elapsed relative user.self sys.self
# 1  ju()          100    5.48    1.000      4.44     1.00
# 2 jdt()          100    6.92    1.263      5.70     1.15

I favor the dplyr approach.

group_by(id) followed by either

  • filter(row_number()==1) or
  • slice(1) or
  • slice_head(1) #(dplyr => 1.0)
  • top_n(n = -1)
    • top_n() internally uses the rank function. Negative selects from the bottom of rank.

In some instances arranging the ids after the group_by can be necessary.

library(dplyr)

# using filter(), top_n() or slice()

m1 <-
test %>% 
  group_by(id) %>% 
  filter(row_number()==1)

m2 <-
test %>% 
  group_by(id) %>% 
  slice(1)

m3 <-
test %>% 
  group_by(id) %>% 
  top_n(n = -1)

All three methods return the same result

# A tibble: 5 x 2
# Groups:   id [5]
     id string
  <int> <fct> 
1     1 A     
2     2 B     
3     3 C     
4     4 D     
5     5 E

Tags:

R

Dataframe

Sqldf