当数据有数千行时,如何自动高效地按列分组并计算该组中另一列的不同值?

How can I automatically and efficiently group by column and count distinct values on another column within that group when data has thousands of rows?

提问人:CubicInfinity 提问时间:11/16/2023 更新时间:11/16/2023 访问量:53

问:

下面是使用 R 生成的一些数据:

library(tidyverse)
set.seed(0)

tibble(A = round(rnorm(10, 20, 2)),
       B = round(rnorm(10, 100, 2))) %>% 
  group_by(B) %>% 
  mutate(C = n_distinct(A)) %>%  # count distinct A per B
  arrange(B, A) %>% 
  write_csv("small_example.csv")

tibble(A = round(rnorm(5000, 10000, 1000)),
       B = round(rnorm(5000, 50000, 1000))) %>% 
  group_by(B) %>% 
  mutate(C = n_distinct(A)) %>%
  arrange(B, A) %>% 
  write_csv("big_example.csv")

small_example.csv应该看起来像这样:

small_example.csv

我正在尝试使用Excel中的VBA在D列中重新创建C列:

Sub UpdateFormulas()
  Dim last_row As Integer
  last_row = ActiveSheet.UsedRange.Rows.Count
  For i = 2 To last_row
    ActiveSheet.Cells(i, 4).Formula = "=SUMPRODUCT(($B$2:$B$" & last_row & "=B" & _
    i & ")/COUNTIFS($A$2:$A$" & last_row & ", $A$2:$A$" & last_row & ", $B$2:$B$" _
    & last_row & ", $B$2:$B$" & last_row & "))"
  Next i
End Sub

此 VBA 过程完全执行它应该执行的操作,并且无论数据的长度如何,它都可以工作。但是,我使用的公式太慢了,似乎是 O(N^2) 操作。如何按需有效地更新这些值?我不反对更简单的方法,但该方法必须是相当自动化的。

Excel VBA

评论

1赞 Scott Craner 11/16/2023
=BYROW(DROP(FILTER(B:B,B:B<>""),1),LAMBDA(z,COUNT(UNIQUE(FILTER(A:A,B:B=z)))))
0赞 CubicInfinity 11/16/2023
@ScottCraner 它仍然很慢,但很酷。
1赞 InjuredCoding 11/16/2023
您希望这是excel中的公式,还是可以在vba中计算c列?如果您在 vba 中执行此操作,然后输入值,它可能会更快,并且不需要不断重新计算
0赞 CubicInfinity 11/18/2023
@InjuredCoding 单元格不需要使用公式。

答:

1赞 taller 11/16/2023 #1
  • 使用提高效率Dictionary
  • 1000 行样本数据需要不到 1 秒的时间
Option Explicit

Sub Demo()
    Dim i As Long, lastRow As Long
    Dim arrData, rngData As Range
    Dim oDicB, oDicAB
    Dim sKeyB As String, sKeyAB As String
    Set oDicB = CreateObject("scripting.dictionary")
    Set oDicAB = CreateObject("scripting.dictionary")
    lastRow = Cells(Rows.Count, "A").End(xlUp).Row
    Set rngData = Range("A2:C" & lastRow)
    arrData = rngData.Value
    For i = LBound(arrData) To UBound(arrData)
        arrData(i, 1) = CStr(arrData(i, 1))
        arrData(i, 2) = CStr(arrData(i, 2))
        sKeyB = arrData(i, 2)
        sKeyAB = arrData(i, 1) & "|" & arrData(i, 2)
        If Not oDicAB.exists(sKeyAB) Then
            oDicAB(sKeyAB) = ""
            If Not oDicB.exists(sKeyB) Then
                oDicB(sKeyB) = 1
            Else
                oDicB(sKeyB) = oDicB(sKeyB) + 1
            End If
        End If
    Next i
    For i = LBound(arrData) To UBound(arrData)
        sKeyB = arrData(i, 2)
        If oDicB.exists(sKeyB) Then
            arrData(i, 3) = oDicB(sKeyB)
        End If
    Next
    rngData.Value = arrData
End Sub

1赞 VBasic2008 11/16/2023 #2

按组运行唯一计数

M365 动态数组公式

=LET(data,A2:B23,uCol,1,gCol,2,
    ud,CHOOSECOLS(data,uCol),
    gd,CHOOSECOLS(data,gCol),
    g,UNIQUE(gd),
XLOOKUP(gd,g,BYROW(g,LAMBDA(r,
    ROWS(UNIQUE(FILTER(ud,gd=r)))))))

enter image description here

使用 VBA 应用

Sub UpdateFormulas()
  
    Const SRC_FIRST_ROW_RANGE As String = "A2:B2"
    Const SRC_UNIQUE_COLUMN As Long = 1
    Const SRC_GROUP_COLUMN As Long = 2
    Const DST_COLUMN As String = "D"
    Const TAB_DEL As String = "    "
    
    Dim ws As Worksheet: Set ws = ActiveSheet ' improve!
    
    Dim srg As Range
    
    With ws.Range(SRC_FIRST_ROW_RANGE)
        Set srg = .Resize(ws.UsedRange.Rows.Count - .Row + 1)
    End With
    
    Dim dcell As Range: Set dcell = srg.Cells(1).EntireRow.Columns(DST_COLUMN)
    
    Dim dFormula2 As String: dFormula2 = "=LET(data," & srg.Address(0, 0) & _
        ",uCol," & SRC_UNIQUE_COLUMN & ",gCol," & SRC_GROUP_COLUMN & "," _
        & vbLf & TAB_DEL & "ud,CHOOSECOLS(data,uCol)," _
        & vbLf & TAB_DEL & "gd,CHOOSECOLS(data,gCol)," _
        & vbLf & TAB_DEL & "g,UNIQUE(gd)," _
        & vbLf & "XLOOKUP(gd,g,BYROW(g,LAMBDA(r," _
        & vbLf & TAB_DEL & "ROWS(UNIQUE(FILTER(ud,gd=r)))))))"
    
    'Debug.Print dFormula2
    
    dcell.Formula2 = dFormula2
  
End Sub

评论

0赞 CubicInfinity 11/18/2023
对于来到这里的其他人来说,这个解决方案已经足够好了。Taller 的解决方案更快。