从字典列表到复杂的 excel 结构-解网

问：

我已将数据操作到如下所示的字典列表中：

[
    {
        "Name-A": {
            "Sub Name-A": {
                "Max": 200,
                "date": {
                    "2022-06-06": {
                        "Time Series": 123,
                        "Probability": 0.2,
                        "A": 1,
                        "B": 2
                    },
                    "2022-06-07": {
                        "Time Series": 245,
                        "Probability": 0.4,
                        "A": 4,
                        "B": 5
                    },
                    "2022-06-08": {
                        "Time Series": 789,
                        "Probability": 0.99,
                        "A": 7,
                        "B": 8
                    },
                    "2022-07-19": {
                        "Time Series": 136,
                        "Probability": 0.1,
                        "A": 10,
                        "B": 11
                    }
                }
            }
        }
    },
    {   
        "Name-B": {
            "Sub Name-B1": {
                "Max": 400,
                "date": {
                    "2022-06-06": {
                        "Time Series": 123,
                        "Probability": 0.2,
                        "A": 1,
                        "B": 2
                    },
                    "2022-06-07": {
                        "Time Series": 245,
                        "Probability": 0.4,
                        "A": 4,
                        "B": 5
                    },
                    "2022-06-08": {
                        "Time Series": 789,
                        "Probability": 0.99,
                        "A": 7,
                        "B": 8
                    },
                    "2022-07-19": {
                        "Time Series": 136,
                        "Probability": 0.1,
                        "A": 10,
                        "B": 11
                    }
                }
            },
            "Sub Name-B2": {
                "Max": 800,
                "date": {
                    "2022-06-06": {
                        "Time Series": 123,
                        "Probability": 0.2,
                        "A": 1,
                        "B": 2
                    },
                    "2022-06-07": {
                        "Time Series": 245,
                        "Probability": 0.4,
                        "A": 4,
                        "B": 5
                    },
                    "2022-06-08": {
                        "Time Series": 789,
                        "Probability": 0.99,
                        "A": 7,
                        "B": 8
                    },
                    "2022-07-19": {
                        "Time Series": 136,
                        "Probability": 0.1,
                        "A": 10,
                        "B": 11
                    }
                }
            }
        }
    }
]

这只是虚拟数据，但基本上可以想象超过 10,000+ 行。贯穿始终的类似架构。我需要从这些数据生成一个 excel 工作表，它应该看起来像这样：

我一直在尝试多种解决方案。当我将字典列表转换为 Pandas Dataframe 时，它看起来完全不同。尝试循环提取标头，但这也是一个问题，因为我无法对值进行硬编码，因为数据长度始终是可变的。关于如何实现这一目标的任何想法/指导？任何帮助将不胜感激！目前正在尝试执行以下操作：

from collections import defaultdict

file_path = "test_output.xlsx"


name_to_sub_name = defaultdict(list)
time_series_cols = ['Time Series', 'Probability', 'A', 'B', 'C']

req_chunks = 0
for each_chunk in record:
    for name, all_data in each_chunk.items():
        for sub_name, each_data in all_data.items():
            name_to_sub_name[name].append(sub_name)
            req_chunks += 1

with Workbook(file_path) as workbook:
    first_col = 1
    entity_row = 0
    cp_row = 2
    worksheet_main = workbook.add_worksheet()
    worksheet_main.write(entity_row+1, first_col-1, "date")
    for name, list_of_cps in name_to_sub_name.items():
        for sub_name in list_of_cps:
            worksheet_main.write(entity_row, first_col, name)
            worksheet_main.write(entity_row, first_col+1, sub_name)

            for each_timeseries in time_series_cols:
                worksheet_main.write(entity_row+1, first_col, each_timeseries)
                entity_row = 0
                first_col += 1
            first_col += 1

我设法按预期动态创建列，但不知道如何从我的字典中填充它们。也许必须有一种更简单的方法来做到这一点，但目前，我不确定如何实现这一目标。pandas

python excel 格式设置到 excel 数据操作

/* Writes a full section & returns the maximum column that this section has consumed.
*/
function WriteASection(row,col, data ) {
  r = row;
  c = col;
  name = firstProp(data); // = Name-A
  name_data = data[name];
  subName = firstProp(name_data); // Sub Name-A
  subName_data = name_data[subName];

  Write2Cell(r,c+1,name);
  Write2Cell(r+1,c+1, subName);

  max_cols_added = 0; // track max. cols consumed
  rH = r +  2; // data header starts in this row
  r2 = r + 3; // data rows starts in this row
  max_found = null;

  foreach_keys( F in subName_data ) {
    if(F == "Max") max_found = subName_data[F];
    else { // now comes the listing out each column
      needHeaders = true;
      date_data = subName_data[F];
      foreach_keys(curr_date in date_data) {
        curr_date_data = date_data[curr_date];
        Write2Cell(r2,c,curr_date);
        i = 0;
        foreach_keys( K in curr_date_data) {
          Write2Cell(r2,c+i,curr_date_data[K]);
          if(needHeaders) {
            Write2Cell(rH,c+i,K);
          }
          i++;
        }
        needHeaders = false; // first row rendition does header
        if(i > max_cols_added) {
          max_cols_added = i;
        }
        r2++;
      }
    }
  }
  Write2Cell(rH+1,c+max_cols_added,curr_date_data[K]); 
  max_cols_added++; // we want to add the max

  return max_cols_added;
}

一旦你完成了这两个功能，那么它就简单了

row = 1;
col = 0;
for( data in DataList ) {
  col++;
  row = 1; // every section starts from 1st row 
  col = WriteASection(row, col, data); // note we use the max col returned
}

希望这会有所帮助 👍

2赞 AveragePythonEnjoyer 7/22/2022 #2

你可以使用 python 包来实现这一点：xlsxwriter

处理字典的示例（快速而肮脏）：

import xlsxwriter
from datetime import datetime

workbook = xlsxwriter.Workbook('Test.xlsx')
worksheet = workbook.add_worksheet('Data')

#data = dict provided in question

#write first column
dates = list(list(list(data[0].items())[0][1].items())[0][1]['date'].keys())
dates = [datetime.strptime(d, '%Y-%m-%d') for d in dates]
date_format = workbook.add_format({'num_format': 'dd/mm/yy',
                                     'align': 'left'})

worksheet.write(3, 0,'Date')

for e, date in enumerate(dates):
    worksheet.write_datetime(3+e,0,date,date_format)

#get the different Data column names and save them    
data_names = list(next(iter(list(list(data[0].items())[0][1].items())[0][1]['date'].items()))[1].keys())


#iterate over the dict and write at the necessary time to the cells
col = 1
for d0 in data:
    for k1 in d0.keys():
        d1 = d0[k1]

        for k2 in d1.keys():
            worksheet.write(0,col,k1)
            worksheet.write(1,col,k2)
            
            col_subset = col
            for name in data_names:
                worksheet.write(2,col_subset,name)
                col_subset+=1
            
            d2 = d1[k2]
            worksheet.write(2,col_subset,'C')
            worksheet.write(3,col_subset,d2['Max'])

            row = 3
            for date in d2['date'].keys():
                d3 = d2['date'][date]
                
                col_subset = col
                for name in data_names:
                    worksheet.write(row, col_subset, d3[name])
                    col_subset+=1
                    
                row+=1
        
            col += len(data_names)+3

workbook.close()

0赞 Edoch 7/28/2022 #3

我试图准确地复制您在 excel 屏幕截图中附加的输出，给定示例输入列表（在我的代码中调用）。record

我使用 openpyxl 和 Pandas 编写了这个脚本。

from openpyxl import Workbook
from openpyxl.utils.cell import coordinate_from_string, column_index_from_string
from openpyxl.utils.dataframe import dataframe_to_rows
from pathlib import Path
import pandas as pd

file_path = Path('.')/'test_output.xlsx'

wb = Workbook()

ws1 = wb.active

is_first = True
col_names = 2
col_data_init = 2

for chunk in record:
    for name_0, value_0 in chunk.items():
        for name_1, value_1 in value_0.items():

            # read data
            df = pd.DataFrame(value_1)

            df = pd.concat(
                [df.loc[:,['Max']].reset_index(),pd.json_normalize(df['date'])],
                axis=1
            ).rename({'index':'Date','Max':'C'},axis=1)
            
            # date column
            if is_first: 
                ws1.cell(row=3,column=1).value = 'Date'
                row_date = 4
                for date_value in df['Date'].to_list():
                    ws1.cell(row=row_date,column=1).value = date_value
                    row_date+=1
                is_first = False

            # write chunk names
            ws1.cell(row=1,column=col_names).value = name_0
            ws1.cell(row=2,column=col_names).value = name_1
            col_names+=7

            # write dataframe columns ['Time Series', 'Probability','A','B']
            row_data_id = 3
            col_data_id = col_data_init
            for df_row in dataframe_to_rows(df.loc[:,['Time Series', 'Probability','A','B']],index=False):
                for df_value in df_row:
                    ws1.cell(row=row_data_id,column=col_data_id).value=df_value
                    col_data_id += 1
                row_data_id += 1
                col_data_id = col_data_init

            # write dataframe columns ['C']
            ws1.cell(row=3,column=col_data_init+4).value = 'C'
            ws1.cell(row=4,column=col_data_init+4).value = df['C'].iloc[0]

            col_data_init += 7
            
wb.save(str(file_path))

wb.close()

它输出一个 excel 文件（称为 test_output.xlsx），结构如下

请注意，我做了一些假设：

每个子组中的字典始终以相同的方式构建
每个子组中的日期相同

它应该适用于任何数据长度（即任意数量的日期）和任意数量的组/子组。

上一个：在 pandas 中复制 excel COUNTIF 的输出

下一个：如何在 R 或 Excel 中每 3 行产生一个结果？

从字典列表到复杂的 excel 结构

Transfrom list of dicts to complex excel structure

评论