提问人:CubicInfinity 提问时间:11/16/2023 更新时间:11/16/2023 访问量:53
当数据有数千行时,如何自动高效地按列分组并计算该组中另一列的不同值?
How can I automatically and efficiently group by column and count distinct values on another column within that group when data has thousands of rows?
问:
下面是使用 R 生成的一些数据:
library(tidyverse)
set.seed(0)
tibble(A = round(rnorm(10, 20, 2)),
B = round(rnorm(10, 100, 2))) %>%
group_by(B) %>%
mutate(C = n_distinct(A)) %>% # count distinct A per B
arrange(B, A) %>%
write_csv("small_example.csv")
tibble(A = round(rnorm(5000, 10000, 1000)),
B = round(rnorm(5000, 50000, 1000))) %>%
group_by(B) %>%
mutate(C = n_distinct(A)) %>%
arrange(B, A) %>%
write_csv("big_example.csv")
small_example.csv
应该看起来像这样:
我正在尝试使用Excel中的VBA在D列中重新创建C列:
Sub UpdateFormulas()
Dim last_row As Integer
last_row = ActiveSheet.UsedRange.Rows.Count
For i = 2 To last_row
ActiveSheet.Cells(i, 4).Formula = "=SUMPRODUCT(($B$2:$B$" & last_row & "=B" & _
i & ")/COUNTIFS($A$2:$A$" & last_row & ", $A$2:$A$" & last_row & ", $B$2:$B$" _
& last_row & ", $B$2:$B$" & last_row & "))"
Next i
End Sub
此 VBA 过程完全执行它应该执行的操作,并且无论数据的长度如何,它都可以工作。但是,我使用的公式太慢了,似乎是 O(N^2) 操作。如何按需有效地更新这些值?我不反对更简单的方法,但该方法必须是相当自动化的。
答:
1赞
taller
11/16/2023
#1
- 使用提高效率
Dictionary
- 1000 行样本数据需要不到 1 秒的时间
Option Explicit
Sub Demo()
Dim i As Long, lastRow As Long
Dim arrData, rngData As Range
Dim oDicB, oDicAB
Dim sKeyB As String, sKeyAB As String
Set oDicB = CreateObject("scripting.dictionary")
Set oDicAB = CreateObject("scripting.dictionary")
lastRow = Cells(Rows.Count, "A").End(xlUp).Row
Set rngData = Range("A2:C" & lastRow)
arrData = rngData.Value
For i = LBound(arrData) To UBound(arrData)
arrData(i, 1) = CStr(arrData(i, 1))
arrData(i, 2) = CStr(arrData(i, 2))
sKeyB = arrData(i, 2)
sKeyAB = arrData(i, 1) & "|" & arrData(i, 2)
If Not oDicAB.exists(sKeyAB) Then
oDicAB(sKeyAB) = ""
If Not oDicB.exists(sKeyB) Then
oDicB(sKeyB) = 1
Else
oDicB(sKeyB) = oDicB(sKeyB) + 1
End If
End If
Next i
For i = LBound(arrData) To UBound(arrData)
sKeyB = arrData(i, 2)
If oDicB.exists(sKeyB) Then
arrData(i, 3) = oDicB(sKeyB)
End If
Next
rngData.Value = arrData
End Sub
1赞
VBasic2008
11/16/2023
#2
按组运行唯一计数
M365 动态数组公式
=LET(data,A2:B23,uCol,1,gCol,2,
ud,CHOOSECOLS(data,uCol),
gd,CHOOSECOLS(data,gCol),
g,UNIQUE(gd),
XLOOKUP(gd,g,BYROW(g,LAMBDA(r,
ROWS(UNIQUE(FILTER(ud,gd=r)))))))
使用 VBA 应用
Sub UpdateFormulas()
Const SRC_FIRST_ROW_RANGE As String = "A2:B2"
Const SRC_UNIQUE_COLUMN As Long = 1
Const SRC_GROUP_COLUMN As Long = 2
Const DST_COLUMN As String = "D"
Const TAB_DEL As String = " "
Dim ws As Worksheet: Set ws = ActiveSheet ' improve!
Dim srg As Range
With ws.Range(SRC_FIRST_ROW_RANGE)
Set srg = .Resize(ws.UsedRange.Rows.Count - .Row + 1)
End With
Dim dcell As Range: Set dcell = srg.Cells(1).EntireRow.Columns(DST_COLUMN)
Dim dFormula2 As String: dFormula2 = "=LET(data," & srg.Address(0, 0) & _
",uCol," & SRC_UNIQUE_COLUMN & ",gCol," & SRC_GROUP_COLUMN & "," _
& vbLf & TAB_DEL & "ud,CHOOSECOLS(data,uCol)," _
& vbLf & TAB_DEL & "gd,CHOOSECOLS(data,gCol)," _
& vbLf & TAB_DEL & "g,UNIQUE(gd)," _
& vbLf & "XLOOKUP(gd,g,BYROW(g,LAMBDA(r," _
& vbLf & TAB_DEL & "ROWS(UNIQUE(FILTER(ud,gd=r)))))))"
'Debug.Print dFormula2
dcell.Formula2 = dFormula2
End Sub
评论
0赞
CubicInfinity
11/18/2023
对于来到这里的其他人来说,这个解决方案已经足够好了。Taller 的解决方案更快。
评论
=BYROW(DROP(FILTER(B:B,B:B<>""),1),LAMBDA(z,COUNT(UNIQUE(FILTER(A:A,B:B=z)))))