data.table与pandas语法对比

本文基于谢士晨博客文章进行修改和增添。

行选择与排序

dt[c(3,1,2)]

# 单条件筛选
dt[Hair == 'Red']

# 多条件筛选
dt[Hair == 'Black' & 
   Freq >= 10 & 
   Eye %in% c('Brown', 'Blue')]
df.iloc[[2,0,1]] # python序数从0开始,2代表第三行
df.loc[[2,0,1]] # 如果index未修改,效果与iloc的一致

# 单条件筛选,去掉.loc效果一致
df.loc[df['Hair'] == 'Red'] 

# pandas 多条件筛选时要用 |, &, ~分别代表or, and, not; 
# 且每个条件用括号区分 
df.loc[(df['Hair'] == 'Black') & 
       (df['Freq'] >= 10) & 
       (df['Eye'].isin(['Brown', 'Blue']))]

排序

dt[order(Sex, -Freq)]

setorder(DT, A, -B) # 行排序

setcolorder(DT, c("C", "A", "B")) # 列排序
df.sort_values(
  ['Sex', 'Freq'], 
  ascending = [True, False] )
  
df = df[['user_id','book_id','rating','mark_date']]  # 列排序

列选择

dt[,.(Hair, Freq)]
# or 
dt[,c('Eye', 'Sex'), with=FALSE]
df[['Hair', 'Freq']]
# or 
df.loc[:,['Eye', 'Sex']] # 选一列时也要保留[],否则与df.Eye一样为series

列计算

# 新建一列
dt[, nc := .I] # .I .N .SD为特殊符号,查看帮助?`.I`
dt[,'nc0'] = 1:32

# 新建多列
dt[, `:=`(
  nc1 = 1:32,
  nc2 = paste(Hair, Eye, sep=',')
)]

# 基于条件新建列
dt[, nc3 := ifelse(Freq >= 10, 1, 0)]
dt[Freq >= 20, nc4 := 2]

# 基于函数新建列
ncols = c('nc', 'nc0') 
dt[, 
   (ncols) := lapply(.SD, function(x) x^0.5 + 1), 
   .SDcols = ncols]

# 删除列
dt[, nc := NULL]
dt[, (c('nc0','nc1','nc2','nc3','nc4')) := NULL]
dt[,!c('C','A')]
# 新建一列
df = df.assign(nc = pd.Series(range(32)))
df.loc[:,'nc0'] = pd.Series(range(32), index=df.index)

# 新建多列
df = df.assign(
  nc1 = pd.Series(range(32)),
  nc2 = df.Hair + ',' + df.Eye
)

# 基于条件新建列
df = df.assign(nc3 = df.Freq.apply(lambda x: 1 if x >= 10 else 0))
df.loc[df.Freq >= 20, 'nc4'] = 2

# 基于函数新建列
ncols = ['nc', 'nc0']
df.loc[:, ncols] = df[ncols].apply(lambda x: x**0.5 + 1)

# 删除列
df = df.drop('nc', axis=1)
df.drop(['nc0','nc1','nc2','nc3','nc4'], axis=1, inplace=True)

列汇总

# 对一列进行计算
dt[, unique(Eye)]
dt[, table(Eye)]

dt[, max(Freq)]

# 对多列进行计算
## 所有列的最大值
dt[, lapply(.SD, max)] 
## 所有列的缺失率
dt[, lapply(.SD, function(x) mean(is.na(x)))] 

## 对部分列计算缺失率,且可扩展到其他函数
sel_cols = c('Hair', 'Sex', 'Freq')
dt[, lapply(.SD, function(x) mean(is.na(x))), .SDcols = sel_cols]
# 基于列的计算
df.Eye.unique()
df.Eye.value_counts()

df.Freq.max()

# 对多列进行计算
## 所有列的最大值
df.max() 
## 所有列的缺失率 
df.isnull().mean() 

## 对部分列计算缺失率,且可扩展到其他函数
sel_cols = ['Hair', 'Sex', 'Freq']
df[sel_cols].apply(lambda x: x.isnull().mean())

分组汇总

# max freq
dt[, .(freq_max = max(Freq)), by = 'Sex']

# count
dt[, .(freq_count = .N), keyby = c('Hair', 'Sex')]
# max freq # pandas的groupby会自动删除缺失变量
df.groupby('Sex').agg({'Freq':'max'}).
  rename(columns={'Freq':'freq_max'}).
  reset_index()

# count
df.groupby(['Hair','Sex']).agg({'Freq':'count'}).
  rename(columns={'Freq':'freq_count'}).
  reset_index()

分组新建列

# 基于sex分组,新建一列等于freq的最大值
dt[, freq_max := max(Freq), by = 'Sex']

# 基于sex分组,选取freq最大的行
dt[order(Freq)][, .SD[.N], by = 'Sex'][]
# 基于sex分组,新建一列等于freq的最大值
df.loc[:,'freq_max'] = df.groupby('Sex')['Freq'].transform(max)

# 基于sex分组,选取freq最大的行
df.sort_values('Freq').groupby('Sex').tail(1)

长宽表转换

# 长表转宽表
dt_w = dcast(dt, Hair+Sex~Eye, value.var = 'Freq', fun.aggregate = sum)

# 宽表转长表
dt_l = melt(dt_w, id = c('Hair','Sex'), variable.name = 'Eye', value.name = 'Freq')
# 长表转宽表
df_w = df.pivot(index=['Hair','Sex'], columns='Eye', values='Freq')

# 宽表转长表
df_l = df_w.melt(id_vars = ['Hair','Sex'], var_name='Freq')

行列合并切割

# 一行切割为多行
dtr = dt[, paste0(Eye, collapse = ','), keyby = c('Hair', 'Sex')]
dtr[, .(Eye = unlist(strsplit(V1, ','))), by = c('Hair', 'Sex')]

# 一列切割为多列
dtc = dt[, .(Hair, eye_sex = paste(Eye, Sex, sep = ','))]
dtc[, c('Eye', 'Sex') := tstrsplit(eye_sex, ',')]
# 一行切割为多行
dfr = df.groupby(['Hair','Sex'])['Eye'].apply(lambda x: ','.join(x)).reset_index()
dfr.assign(Eye = dfr['Eye'].str.split(',')).explode('Eye')

# 一列切割为多列
dfc = df[['Hair']].assign(eye_sex = df.Eye+','+df.Sex)
dfc['Eye'], dfc['Sex']= dfc['eye_sex'].str.split(',', 1).str

数据框合并

# 合并两个数据框
dtr1 = rbind(dt[sample(.N,2)], dt[sample(.N,3)])
# 如果两个数据框的列名不一致,需要添加fill为TRUE
dtr2 = rbind(dt[sample(.N,2)], dt[sample(.N,3), .(Hair)], fill=TRUE)

# 直接合并多个数据框组成的 list
dt_lst = list(
  dt1 = dt[sample(.N,2)], 
  dt2 = dt[sample(.N,3)], 
  dt3 = dt[sample(.N,4)])
dt_bind = rbindlist(dt_lst, idcol = 'dt')
# 合并两个数据框
dfr1 = pd.concat([df.sample(n=2), df.sample(n=3)])
# 如果两个数据框的列名不一致
dfr2 = pd.concat([df.sample(n=2), df.sample(n=3)[['Hair']]], sort=False)

# 直接合并多个数据框组成的 list
df_lst = [
  df.sample(2), 
  df.sample(3), 
  df.sample(4)]
df_con = pd.concat(df_lst, axis=0)