我需要使用 numpy 的矢量化来优化我的双精度 for 循环

I need to use numpy's vectorization to optimize my double for loop

提问人:jaried 提问时间:3/19/2021 最后编辑:Jérôme Richardjaried 更新时间:4/20/2021 访问量:153

问:

我有一个带有嵌套 for 循环的 python 函数,该循环被调用了数千次,而且速度太慢。从我在网上读到的内容来看,应该有一种方法可以使用 numpy 矢量化来优化它,以便迭代是用更快的 C 代码而不是 python 完成的。但是,我以前从未使用过 numpy,我无法弄清楚。

函数为:从第一行开始先切片,然后计算s列1行从后到前的总和,如果大于或等于n,则返回总数;然后从前两行切片,然后计算S列1行从后到前的总和,如果小于n,则计算S列2行的总和,如果大于等于n,则返回行数;依此类推,直到所有行都循环。

原始代码(python 3.8)是:

import pandas as pd
from time import time
import numpy as np

def sumbars(df, s, n):
    start = time()
    df2 = df[s].copy()
    # df2.loc[0]=1000
    df['sumbars'] = 0
    for i in range(len(df)):
        df3 = df2[:i + 1]
        l = len(df3)
        for j in range(l):
            # if i>=8:
            #     print()
            df4 = df3[l - j - 1:]
            if df4.sum() >= n:
                df.loc[i, 'sumbars'] = j + 1
                # df5=df[['trade_date',s,'sumbars']]
                break

    stop = time()
    global sumbarstime
    sumbarstime = sumbarstime + stop - start
    return df['sumbars']

def main():
    # df=pd.read_csv('todo.csv')
    df = pd.DataFrame({'ts_code':['IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX'],'jc':[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]})
    df['jcts2'] = sumbars(df,'jc',2)

    
    print('')
    df.to_csv("result.csv",index=False,sep=',')

    # print sec

    return

main()

我需要优化双精度 for 循环是:

for i in range(len(df)):
        df3 = df2[:i + 1]
        l = len(df3)
        for j in range(l):
            # if i>=8:
            #     print()
            df4 = df3[l - j - 1:]
            if df4.sum() >= n:
                df.loc[i, 'sumbars'] = j + 1
                # df5=df[['trade_date',s,'sumbars']]
                break

有人帮我优化了一次代码,我需要优化一次最后的for循环。 优化 for 循环后:

import pandas as pd
from time import time
import numpy as np


def sumbars(df, s, n, ):
    start = time()
    sdata = np.array(df[s])
    sdata = np.flipud(sdata)
    l = len(sdata)
    sumbars = np.zeros(l)
    for i in range(l):
        n1 = sdata[i:]
        cumsum = np.cumsum(n1)
        k = np.argmax(cumsum >= n)
        n2 = cumsum[k]
        if (k != 0) | ((n2 != 0) & (n == 1)):
            sumbars[l - i - 1] = k + 1
    sumbars = sumbars.astype(int)
    df['sumbars'] = sumbars
    stop = time()
    global sumbarstime
    sumbarstime += stop - start
    # df.to_csv("my.csv")
    # print(df['sumbars'])
    return df['sumbars']

def main():
    # df=pd.read_csv('todo.csv')
    df = pd.DataFrame({'ts_code':['IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX'],'jc':[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]})
    df['jcts2'] = sumbars(df,'jc',2)

    
    print('')
    df.to_csv("result.csv",index=False,sep=',')

    # print sec

    return

main()

帮助:我需要用numpy矢量化优化最后一个for循环。除了使用 cumsum,还有其他方法可以优化我原来的 double for 循环吗?

    for i in range(l):
        n1 = sdata[i:]
        cumsum = np.cumsum(n1)
        k = np.argmax(cumsum >= n)
        n2 = cumsum[k]
        if (k != 0) | ((n2 != 0) & (n == 1)):
            sumbars[l - i - 1] = k + 1

在指出我的问题后,请给我更多时间回复。

Python Pandas 性能 数据帧 numpy

评论

2赞 hpaulj 3/20/2021
并非每个计算都是“可矢量化的”。 矢量化意味着使用编译的 numpy 方法。这些方法中的大多数本质上都是“并行”的,对整个数组进行操作,没有任何隐含的顺序或序列(尽管代码会迭代)。 是一个例外,一个本质上是顺序的函数。你的使用表明问题本质上是连续的,可能需要跳出框框思考来“矢量化”(如果可能的话)。numpycnp.cumsumcumsum
0赞 jaried 3/20/2021
我添加了原始代码。除了使用 cumsum,还有其他方法可以优化我的双精度 for 循环吗?
2赞 Lith 3/20/2021
您可以随时尝试 numba 库,它应该可以很好地与 numpy 函数配合使用。
1赞 Jérôme Richard 3/20/2021
输入值总是正数吗?df[s]
0赞 jaried 3/20/2021
df[s] 的输入值始终为正数

答:

4赞 Jérôme Richard 3/20/2021 #1

假设 df[s] 的输入值始终为正值,则可以显著改进算法。事实上,当前实现的复杂度是 O(n**2),而在这种情况下,可以编写复杂度为 O(n log n) 的算法(甚至可能是 )。O(n)

改进的解决方案包括仅计算一次并执行增量更新。 实际上,不需要重新计算它,因为 总是等于 with 和 。 是独立的,因此可以预先计算。 可以在循环中以增量方式计算。 可以使用二进制搜索找到,因为值是排序的。cumsumnp.cumsum(arr[i:])fullCumsum[i:]-mfullCumsum = np.cumsum(arr)m = np.sum(arr[:i])fullCumsumimk

下面是生成的(纯 Python)实现:

def sumbarsFast(df, s, n, ):
    sdata = np.array(df[s])
    sdata = np.flipud(sdata)
    l = len(sdata)
    sumbars = np.zeros(l)
    fullCumsum = np.cumsum(sdata)
    m = 0
    for i in range(l):
        k = np.searchsorted(fullCumsum[i:], n + m)
        if k == len(fullCumsum)-i:
            k = 0
        if (k != 0) or ((fullCumsum[i+k] != m) and (n == 1)):
            sumbars[l - i - 1] = k + 1
        m += sdata[i]
    sumbars = sumbars.astype(int)
    df['sumbars'] = sumbars
    return df['sumbars']

使用 Numba JIT 和并行性可以更快:

from numba import njit, prange, int64

@njit(int64[:](int64[:], int64), parallel=True)
def sumbarsKernel(sdata, n):
    l = len(sdata)
    fullCumsum = np.cumsum(sdata)
    sumbars = np.zeros(l, dtype=np.int64)
    nShared = np.int64(n)
    for i in prange(l):
        m = fullCumsum[i-1] if i > 0 else 0
        k = np.searchsorted(fullCumsum[i:], n + m)
        if k == len(fullCumsum)-i:
            k = 0
        if (k != 0) or ((fullCumsum[i+k] != m) and (nShared == 1)):
            sumbars[l - i - 1] = k + 1
    return sumbars

def sumbarsSuperFast(df, s, n, ):
    sdata = np.array(df[s], dtype=np.int64)
    sdata = np.flipud(sdata)
    sumbars = sumbarsKernel(sdata, n)
    sumbars = sumbars.astype(int)
    df['sumbars'] = sumbars
    return df['sumbars']

以下是我的机器上包含 200000 行的数据帧的计时:

Your naive version:         589.764 seconds
Your "optimized" version:   121.683 seconds
sumbarsFast (pure Python):    1.081 second
sumbarsSuperFast (Numba):     0.004 second

最终实现比朴素版本快约 150000 倍,比“优化”版本快 30000 倍

评论

0赞 jaried 3/20/2021
看来效果超级好,我现在就去调试代码。
0赞 jaried 3/20/2021
如果我使用我发布的数据帧,我注释掉了 JIT 行,并且 sumbarsSuperFast 的结果是正确的。如果不注释,则结果是错误的。为什么?
1赞 Jérôme Richard 3/20/2021
事实上,问题来自Numba代码中的并行性。它看起来像是 Numba 中的一个错误prange 的限制(与 Python 对象有关)......我通过将 Python 整数显式复制到安全共享变量中来解决此问题。现在,所有实现之间的结果都相同,并且性能仍然相同:)。nnp.int64