提问人:Leo 提问时间:11/16/2023 更新时间:11/17/2023 访问量:42
groupby.rolling 比 with for 循环慢
groupby.rolling slower than with for loop
问:
我想在大型数据集上通过滚动平均值来执行 groupformr。
使用大型 df,执行 groupby.rolling 似乎比使用 for 循环慢:
这
grouped['value'].rolling(window=window_size, min_periods=0).mean().reset_index(level=0, drop=True)
似乎比:
for key, grp in grouped:
grp['rolling_mean'] = grp['value'].rolling(window=window_size, min_periods=0).mean()
rolling_mean_df = pd.concat([rolling_mean_df, grp])
为什么会这样?以及我可以使用什么方法来加速 Groupby 在大型 DF 上的滚动平均值。
import matplotlib.pyplot as plt
import time
import numpy as np
import pandas as pd
# Define different sizes to test
sizes = [10, 100, 1000, 10000, 20000, 30000, 50000, 100000, 200000, 500000]
window_size = 10
# Lists to store execution times for each method
times_groupby_1 = []
times_groupby_2 = []
for size in sizes:
# Create a DataFrame for each size
data = {'value': np.random.rand(size), 'group': np.random.choice(['A', 'B', 'C', 'D'], size)}
sample_df = pd.DataFrame(data)
grouped = sample_df.groupby('group')
# Method 1: Using groupby with rolling mean (for loop method)
start_time = time.time()
rolling_mean_df = pd.DataFrame()
for key, grp in grouped:
grp['rolling_mean'] = grp['value'].rolling(window=window_size, min_periods=0).mean()
rolling_mean_df = pd.concat([rolling_mean_df, grp])
end_time = time.time()
times_groupby_1.append(end_time - start_time)
# Method 2: Using groupby with rolling mean (direct method)
start_time = time.time()
sample_df['rolling_mean'] = grouped['value'].rolling(window=window_size, min_periods=0).mean().reset_index(level=0, drop=True)
end_time = time.time()
times_groupby_2.append(end_time - start_time)
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(sizes, times_groupby_1, label='Method 1 (for loop)', marker='o')
plt.plot(sizes, times_groupby_2, label='Method 2 (direct)', marker='o')
plt.xlabel('DataFrame Size')
plt.ylabel('Time (seconds)')
plt.title('Comparison of Execution Times for Rolling Mean Calculation')
plt.xscale('log')
plt.yscale('log')
plt.legend()
plt.grid(True)
plt.show()
答:
0赞
Zihao
11/17/2023
#1
如果“group”键的基数不大,Groupby 可能不是瓶颈。
并且 mean() 函数在某些版本的 pandas 之后得到了高度优化(可能是 >=1.3.0)。
1赞
Andrej Kesely
11/17/2023
#2
这完全取决于数据的特征。如果您只有几个组和许多值(如您的示例所示),则标准将非常快(高度优化)。.rolling()
.mean()
但是,如果您的数据包含许多组,则可以尝试使用自定义解决方案(以消除连接数据帧等的开销)。例如,您可以使用 numba:
from numba import float64, int64, njit
from numba.typed import Dict, List
from numba.types import ListType
list_type_float64 = ListType(float64)
list_type_int64 = ListType(int64)
@njit
def _mean(lst):
s = 0
for v in lst:
s = s + v
return s / max(len(lst), 1)
@njit
def _get_rolling_mean_numba(index, group, value, size):
groups = Dict.empty(int64, list_type_float64)
indices = Dict.empty(int64, list_type_int64)
out = np.empty_like(value)
for i, g, v in zip(index, group, value):
groups.setdefault(g, List.empty_list(float64)).append(v)
indices.setdefault(g, List.empty_list(int64)).append(i)
cnt = 0
for k, vals in groups.items():
for i in range(0, len(vals)):
out[cnt] = _mean(vals[max(0, i - size) : i + 1])
cnt += 1
return out, [i for ind in indices.values() for i in ind]
def get_rolling_mean_numba(df, size):
v, i = _get_rolling_mean_numba(
df.index.to_numpy(),
df.group.to_numpy(),
df.value.to_numpy(),
size,
)
df["rolling_mean"] = pd.Series(v, index=i)
return df
基准测试使用(其中组数为总值的 ~1/3):perfplot
from statistics import mean
import numpy as np
import pandas as pd
import perfplot
from matplotlib import pyplot as plt
from numba import float64, int64, njit
from numba.typed import Dict, List
from numba.types import ListType
plt.rcParams["figure.autolayout"] = True
list_type_float64 = ListType(float64)
list_type_int64 = ListType(int64)
def get_sample_df(size=10):
np.random.seed(42)
sample_df = pd.DataFrame(
{
"value": np.random.rand(size),
"group": np.random.choice(range(size // 3), size),
}
)
return sample_df
@njit
def _mean(lst):
s = 0
for v in lst:
s = s + v
return s / max(len(lst), 1)
@njit
def _get_rolling_mean_numba(index, group, value, size):
groups = Dict.empty(int64, list_type_float64)
indices = Dict.empty(int64, list_type_int64)
out = np.empty_like(value)
for i, g, v in zip(index, group, value):
groups.setdefault(g, List.empty_list(float64)).append(v)
indices.setdefault(g, List.empty_list(int64)).append(i)
cnt = 0
for k, vals in groups.items():
for i in range(0, len(vals)):
out[cnt] = _mean(vals[max(0, i - size) : i + 1])
cnt += 1
return out, [i for ind in indices.values() for i in ind]
def get_rolling_mean_numba(df, size):
v, i = _get_rolling_mean_numba(
df.index.to_numpy(),
df.group.to_numpy(),
df.value.to_numpy(),
size,
)
df["rolling_mean"] = pd.Series(v, index=i)
return df
def get_rolling_mean_original(df, size):
grouped = df.groupby("group")
df["rolling_mean"] = (
grouped["value"]
.rolling(window=size, min_periods=0)
.mean()
.reset_index(level=0, drop=True)
)
return df
def get_rolling_mean_original_2(df, size):
grouped = df.groupby("group")
rolling_mean_df = pd.DataFrame()
for key, grp in grouped:
grp["rolling_mean"] = grp["value"].rolling(window=size, min_periods=0).mean()
rolling_mean_df = pd.concat([rolling_mean_df, grp])
return rolling_mean_df
def get_rolling_mean_standard(df, size):
out, groups, indices = [], {}, {}
for i, g, v in zip(df.index, df["group"], df["value"]):
groups.setdefault(g, []).append(v)
indices.setdefault(g, []).append(i)
for k, vals in groups.items():
groups[k] = [mean(vals[max(0, i - size) : i + 1]) for i in range(0, len(vals))]
df["rolling_mean"] = pd.Series(
[v for vals in groups.values() for v in vals],
index=[i for indices in indices.values() for i in indices],
)
return df
print(get_rolling_mean_original(get_sample_df(), 10).sort_values(by=["group", "value"]))
print(
get_rolling_mean_original_2(get_sample_df(), 10).sort_values(by=["group", "value"])
)
print(get_rolling_mean_standard(get_sample_df(), 10).sort_values(by=["group", "value"]))
# this compiles the numba version:
print(get_rolling_mean_numba(get_sample_df(), 10).sort_values(by=["group", "value"]))
perfplot.show(
setup=lambda size: pd.DataFrame(
{
"value": np.random.rand(size),
"group": np.random.choice(range(size // 3), size),
}
),
kernels=[
lambda df: get_rolling_mean_original(df, 10),
lambda df: get_rolling_mean_original_2(df, 10),
lambda df: get_rolling_mean_standard(df, 10),
lambda df: get_rolling_mean_numba(df, 10),
],
labels=["original 1", "original 2", "plain python", "numba"],
n_range=[10, 100, 1000, 10000, 20000, 30000, 50000, 100000, 200000],
xlabel="N",
logx=True,
logy=True,
equality_check=None,
)
结果:
评论