当数据有数千行时，如何自动高效地按列分组并计算该组中另一列的不同值？-解网

问：

下面是使用 R 生成的一些数据：

library(tidyverse)
set.seed(0)

tibble(A = round(rnorm(10, 20, 2)),
       B = round(rnorm(10, 100, 2))) %>% 
  group_by(B) %>% 
  mutate(C = n_distinct(A)) %>%  # count distinct A per B
  arrange(B, A) %>% 
  write_csv("small_example.csv")

tibble(A = round(rnorm(5000, 10000, 1000)),
       B = round(rnorm(5000, 50000, 1000))) %>% 
  group_by(B) %>% 
  mutate(C = n_distinct(A)) %>%
  arrange(B, A) %>% 
  write_csv("big_example.csv")

small_example.csv应该看起来像这样：

我正在尝试使用Excel中的VBA在D列中重新创建C列：

Sub UpdateFormulas()
  Dim last_row As Integer
  last_row = ActiveSheet.UsedRange.Rows.Count
  For i = 2 To last_row
    ActiveSheet.Cells(i, 4).Formula = "=SUMPRODUCT(($B$2:$B$" & last_row & "=B" & _
    i & ")/COUNTIFS($A$2:$A$" & last_row & ", $A$2:$A$" & last_row & ", $B$2:$B$" _
    & last_row & ", $B$2:$B$" & last_row & "))"
  Next i
End Sub

此 VBA 过程完全执行它应该执行的操作，并且无论数据的长度如何，它都可以工作。但是，我使用的公式太慢了，似乎是 O（N^2）操作。如何按需有效地更新这些值？我不反对更简单的方法，但该方法必须是相当自动化的。

Excel VBA

Option Explicit

Sub Demo()
    Dim i As Long, lastRow As Long
    Dim arrData, rngData As Range
    Dim oDicB, oDicAB
    Dim sKeyB As String, sKeyAB As String
    Set oDicB = CreateObject("scripting.dictionary")
    Set oDicAB = CreateObject("scripting.dictionary")
    lastRow = Cells(Rows.Count, "A").End(xlUp).Row
    Set rngData = Range("A2:C" & lastRow)
    arrData = rngData.Value
    For i = LBound(arrData) To UBound(arrData)
        arrData(i, 1) = CStr(arrData(i, 1))
        arrData(i, 2) = CStr(arrData(i, 2))
        sKeyB = arrData(i, 2)
        sKeyAB = arrData(i, 1) & "|" & arrData(i, 2)
        If Not oDicAB.exists(sKeyAB) Then
            oDicAB(sKeyAB) = ""
            If Not oDicB.exists(sKeyB) Then
                oDicB(sKeyB) = 1
            Else
                oDicB(sKeyB) = oDicB(sKeyB) + 1
            End If
        End If
    Next i
    For i = LBound(arrData) To UBound(arrData)
        sKeyB = arrData(i, 2)
        If oDicB.exists(sKeyB) Then
            arrData(i, 3) = oDicB(sKeyB)
        End If
    Next
    rngData.Value = arrData
End Sub

1赞 VBasic2008 11/16/2023 #2

按组运行唯一计数

M365 动态数组公式

=LET(data,A2:B23,uCol,1,gCol,2,
    ud,CHOOSECOLS(data,uCol),
    gd,CHOOSECOLS(data,gCol),
    g,UNIQUE(gd),
XLOOKUP(gd,g,BYROW(g,LAMBDA(r,
    ROWS(UNIQUE(FILTER(ud,gd=r)))))))

使用 VBA 应用

Sub UpdateFormulas()
  
    Const SRC_FIRST_ROW_RANGE As String = "A2:B2"
    Const SRC_UNIQUE_COLUMN As Long = 1
    Const SRC_GROUP_COLUMN As Long = 2
    Const DST_COLUMN As String = "D"
    Const TAB_DEL As String = "    "
    
    Dim ws As Worksheet: Set ws = ActiveSheet ' improve!
    
    Dim srg As Range
    
    With ws.Range(SRC_FIRST_ROW_RANGE)
        Set srg = .Resize(ws.UsedRange.Rows.Count - .Row + 1)
    End With
    
    Dim dcell As Range: Set dcell = srg.Cells(1).EntireRow.Columns(DST_COLUMN)
    
    Dim dFormula2 As String: dFormula2 = "=LET(data," & srg.Address(0, 0) & _
        ",uCol," & SRC_UNIQUE_COLUMN & ",gCol," & SRC_GROUP_COLUMN & "," _
        & vbLf & TAB_DEL & "ud,CHOOSECOLS(data,uCol)," _
        & vbLf & TAB_DEL & "gd,CHOOSECOLS(data,gCol)," _
        & vbLf & TAB_DEL & "g,UNIQUE(gd)," _
        & vbLf & "XLOOKUP(gd,g,BYROW(g,LAMBDA(r," _
        & vbLf & TAB_DEL & "ROWS(UNIQUE(FILTER(ud,gd=r)))))))"
    
    'Debug.Print dFormula2
    
    dcell.Formula2 = dFormula2
  
End Sub

当数据有数千行时，如何自动高效地按列分组并计算该组中另一列的不同值？

How can I automatically and efficiently group by column and count distinct values on another column within that group when data has thousands of rows?

评论

按组运行唯一计数

评论