提问人:user12011 提问时间:7/2/2023 更新时间:7/2/2023 访问量:113
如何使用 Python 将复杂的嵌套 JSON 转换为 csv?
How do I convert a complex nested JSON to csv using Python?
问:
我想使用 Python 将以下嵌套的 JSON 文件转换为 CSV 文件。
{
"page": {
"page": 1,
"pageSize": 250
},
"dataRows": [
{
"entityId": 349255,
"Id": "41432-95P",
"disabled": false,
"followed": false,
"suggestion": false,
"inactive": false,
"pinned": false,
"highlighted": false,
"columnValues": {
"lastName": [
{
"columnValueType": "ENTITY",
"accessStatus": "OK",
"columnValueType": "ENTITY",
"name": "McBrady",
"Id": "41432-95P",
"unpublished": false
}
],
"gender": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "Male"
}
],
"hqCity": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "Seattle"
}
],
"prefix": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "Dr."
}
],
"lastUpdateDate": [
{
"columnValueType": "DATE",
"accessStatus": "OK",
"columnValueType": "DATE",
"expected": false,
"asOfdate": "2023-06-26"
}
],
"companyName": [
{
"columnValueType": "BUSINESS_ENTITY",
"accessStatus": "OK",
"columnValueType": "BUSINESS_ENTITY",
"name": "Global Partnerships",
"Id": "56347-39",
"unpublished": false,
"profileType": "INVESTOR"
}
],
"roles": [
{
"columnValueType": "INT_COLUMN_VALUE",
"accessStatus": "OK",
"columnValueType": "INT_COLUMN_VALUE",
"marked": false,
"value": 3
}
],
"dailyUpdates": [],
"assetClass": [],
"hqCountry": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "United States"
}
],
"latestNoteAuthor": [],
"primaryPosition": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "Chair, Enterprise Risk, Compliance and Audit Committee and Member of the Board of Directors"
}
],
"boardSeats": [
{
"columnValueType": "INT_COLUMN_VALUE",
"accessStatus": "OK",
"columnValueType": "INT_COLUMN_VALUE",
"marked": false,
"value": 2
}
],
"fundRoles": [],
"institution": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "Harvard University"
},
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "University of Oxford"
}
],
"latestNote": [],
"Id": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "41432-95P"
}
],
"hqRegion": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "Americas"
}
],
"email": [],
"dealRoles": [],
"PrimaryCompanyType": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "Not-For-Profit Venture Capital"
}
],
"mgtRoles": [],
"hqStateProvince": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "Washington"
}
],
"fullName": [
{
"columnValueType": "ENTITY_WITH_NOTE",
"accessStatus": "OK",
"columnValueType": "ENTITY_WITH_NOTE",
"name": "Matthew McBrady Ph.D",
"Id": "41432-95P",
"unpublished": false
}
],
"hqLocation": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "Seattle, WA"
}
],
"biography": [
{
"columnValueType": "DESCRIPTION_WITH_SOURCE",
"accessStatus": "OK",
"columnValueType": "DESCRIPTION_WITH_SOURCE",
"value": "Dr. Matthew McBrady serves as Chair, of the Enterprise Risk, Compliance, and Audit Committee.",
"morningstarSource": true
}
],
"firstName": [
{
"columnValueType": "ENTITY",
"accessStatus": "OK",
"columnValueType": "ENTITY",
"name": "Matthew",
"Id": "41432-95P",
"unpublished": false
}
],
"phone": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "+1 (206) 652-8773"
}
],
"hqSubRegion": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "North America"
}
],
"hqAddressLine2": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "Suite 410"
}
],
"hqAddressLine1": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "1201 Western Avenue"
}
],
"hqFax": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "+1 (206) 456-7877"
}
],
"middleName": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "R."
}
],
"companyWebsite": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "www.globalpartnerships.org"
}
],
"hqZipCode": [
{
"columnValueType": "STRING",
"accessStatus": "OK",
"columnValueType": "STRING",
"value": "98101"
}
],
"weeklyUpdates": []
}
}
]
}
在用 ChatGPT 反复迭代之后,我得到了下面的代码。但是,我无法让它捕获包含 entityId、Id 的第一层嵌套。此外,对于后续的每一层嵌套,我都想捕获其中的所有字段;例如,在“lastName”中,我想同时捕获“name”和“Id”;同样,在“companyName”中,我希望“name”、“Id”和“profileType”分别位于单独的列中。正如我对 ChatGPT 说明过的,我不关心“columnValueType”、“accessStatus”或“unpublished”。
这是 python 代码:
import csv
import json
def extract_field_value(data):
    """Flatten a JSON fragment (dict, list or scalar) into one display string.

    Rules, applied in this order:
      * dict containing "value"            -> ``str(data["value"])``
      * dict whose columnValueType is
        "ENTITY" and that has a "name"     -> ``str(data["name"])``
      * any other dict                     -> its remaining fields, flattened
                                              recursively and joined with ", "
                                              (metadata keys are skipped)
      * list                               -> its flattened items joined with ", "
      * scalar                             -> ``str(data)``; ``None`` becomes ""

    Empty pieces are dropped before joining, so the result never contains
    dangling separators such as ", x".

    Args:
        data: Any value produced by ``json.load``.

    Returns:
        A string; "" when there is nothing to show.
    """
    # Bookkeeping fields that should never show up in the output.
    ignored_keys = ('columnValueType', 'accessStatus', 'unpublished')

    if isinstance(data, dict):
        if 'value' in data:
            return str(data['value'])
        # Only take the ENTITY shortcut when "name" is really there; otherwise
        # fall through to the generic branch instead of raising KeyError.
        if data.get('columnValueType') == 'ENTITY' and 'name' in data:
            return str(data['name'])
        parts = [
            extract_field_value(value)
            for key, value in data.items()
            if key not in ignored_keys
        ]
        return ', '.join(part for part in parts if part)

    if isinstance(data, list):
        parts = [extract_field_value(item) for item in data]
        return ', '.join(part for part in parts if part)

    return '' if data is None else str(data)
# Read the JSON data. JSON text is UTF-8 by specification, so state the
# encoding explicitly instead of relying on the platform default.
with open('data.json', encoding='utf-8') as file:
    data = json.load(file)

# Extract the nested data rows
data_rows = data['dataRows']

# Collect the column headers from every data row, keeping first-seen order.
# Looking only at the first row would silently drop columns that appear only
# in later rows, and would raise IndexError when there are no rows at all.
column_headers = []
seen_headers = set()
for row in data_rows:
    for header in row['columnValues']:
        if header not in seen_headers:
            seen_headers.add(header)
            column_headers.append(header)

# Create a CSV file
with open('data.csv', 'w', newline='') as file:
    writer = csv.writer(file)

    # Write the column headers as the first row
    writer.writerow(column_headers)

    # Write each data row as a separate row in the CSV file; a column that is
    # missing from a row is flattened from an empty list, i.e. becomes "".
    for row in data_rows:
        column_values = row['columnValues']
        writer.writerow(
            [
                extract_field_value(column_values.get(header, []))
                for header in column_headers
            ]
        )

print("CSV file created successfully.")
答:
0赞
Zach Young
7/2/2023
#1
我看到你试图编写一个更通用的程序,让它自己去弄清楚 JSON 的结构。我认为至少在一开始,更容易的做法是:既然你已经知道这个结构,就直接把它明确地写进代码里……在这种情况下,写得非常明确看起来要简单得多。
此方法还利用了 csv 模块中的 DictWriter 类,因此最终得到的每一行(字典)本身就带有列标题……无需单独跟踪。我也喜欢类型注解,所以我为我看到的 JSON 结构添加了类型提示(尤其是 columnValues 的结构)。
import csv
import json
from typing import Any
def extract_columns(
    col_values: dict[str, list[dict[str, Any]]],
    *,
    join_multiple: bool = False,
    sep: str = ", ",
) -> dict[str, Any]:
    """
    Flatten the dict of columnValues down to a single dict.

    Each columnValues key, upper-cased, becomes the prefix of the keys found
    in its items, e.g. ``{"lastName": [{"name": "X"}]}`` gives
    ``{"LASTNAME_name": "X"}``. Columns whose list is empty are skipped, and
    the bookkeeping keys "columnValueType", "accessStatus" and "unpublished"
    are dropped.

    Args:
        col_values: Mapping of column name to a list of value dicts.
        join_multiple: By default only the first item of each list is used
            (the original single-item assumption). When True, every item is
            used: a key that occurs in several items gets its values
            converted with ``str`` and joined with ``sep``, so multi-valued
            columns such as "institution" are not truncated.
        sep: Separator used when ``join_multiple`` is True.

    Returns:
        A flat dict mapping "<COLUMN>_<key>" to the value. Values that come
        from a single item keep their original type.
    """
    ignored_keys = {"columnValueType", "accessStatus", "unpublished"}

    row: dict[str, Any] = {}
    for col_name, list_of_vals in col_values.items():
        if not list_of_vals:
            continue

        prefix = col_name.upper()
        items = list_of_vals if join_multiple else list_of_vals[:1]

        # Gather the values per key across the selected items, keeping the
        # order in which the keys were first seen.
        collected: dict[str, list[Any]] = {}
        for vals in items:
            for k, v in vals.items():
                if k in ignored_keys:
                    continue
                collected.setdefault(k, []).append(v)

        for k, found in collected.items():
            # A single value is passed through untouched so ints and bools
            # keep their type; only genuinely multi-valued keys become text.
            if len(found) == 1:
                row[prefix + "_" + k] = found[0]
            else:
                row[prefix + "_" + k] = sep.join(str(v) for v in found)

    return row
def extract_row(
    data_row: dict[str, Any],
    top_level_keys: tuple[str, ...] = (
        "Id",
        "disabled",
        "followed",
        "suggestion",
        "inactive",
        "pinned",
        "highlighted",
    ),
) -> dict[str, Any]:
    """
    Build one flat CSV row from a single entry of "dataRows".

    The selected top-level fields are copied first, in the given order, and
    the flattened "columnValues" follow.

    Args:
        data_row: One element of the JSON "dataRows" list.
        top_level_keys: Names of the top-level fields to copy. The default
            reproduces the original fixed list; pass e.g.
            ``("entityId", "Id")`` to capture other fields.

    Returns:
        A flat dict that can be handed to ``csv.DictWriter``.

    Raises:
        KeyError: If a requested top-level key or "columnValues" is missing.
    """
    row: dict[str, Any] = {k: data_row[k] for k in top_level_keys}
    row.update(extract_columns(data_row["columnValues"]))
    return row
csv_rows: list[dict[str, Any]] = []

# JSON text is UTF-8 by specification, so state the encoding explicitly.
with open("input.json", encoding="utf-8") as f:
    data = json.load(f)

for data_row in data["dataRows"]:
    csv_rows.append(extract_row(data_row))

# Build the header from the union of keys over all rows, in first-seen order.
# Rows skip their empty columns, so a later row can carry a key the first row
# lacks; DictWriter raises ValueError for such keys if only the first row is
# used. Using a dict as an ordered set also copes with zero rows.
fieldnames: dict[str, None] = {}
for csv_row in csv_rows:
    fieldnames.update(dict.fromkeys(csv_row))

with open("output.csv", "w", newline="") as f:
    # Keys missing from a row are written as "" (DictWriter's default restval).
    writer = csv.DictWriter(f, fieldnames=list(fieldnames))
    writer.writeheader()
    writer.writerows(csv_rows)
当我在示例 JSON 上运行它时,第一行最终如下所示:
最终列名称 | 最终值 |
---|---|
Id | 41432-95P |
disabled | False |
followed | False |
suggestion | False |
inactive | False |
pinned | False |
highlighted | False |
LASTNAME_name | McBrady |
LASTNAME_Id | 41432-95P |
GENDER_value | Male |
HQCITY_value | Seattle |
PREFIX_value | Dr. |
LASTUPDATEDATE_expected | False |
LASTUPDATEDATE_asOfdate | 2023-06-26 |
COMPANYNAME_name | Global Partnerships |
COMPANYNAME_Id | 56347-39 |
COMPANYNAME_profileType | INVESTOR |
ROLES_marked | False |
ROLES_value | 3 |
HQCOUNTRY_value | United States |
PRIMARYPOSITION_value | Chair, Enterprise Risk, Compliance and Audit Committee and Member of the Board of Directors |
BOARDSEATS_marked | False |
BOARDSEATS_value | 2 |
INSTITUTION_value | Harvard University |
ID_value | 41432-95P |
HQREGION_value | Americas |
PRIMARYCOMPANYTYPE_value | Not-For-Profit Venture Capital |
HQSTATEPROVINCE_value | Washington |
FULLNAME_name | Matthew McBrady Ph.D |
FULLNAME_Id | 41432-95P |
HQLOCATION_value | Seattle, WA |
BIOGRAPHY_value | Dr. Matthew McBrady serves as Chair, of the Enterprise Risk, Compliance, and Audit Committee. |
BIOGRAPHY_morningstarSource | True |
FIRSTNAME_name | Matthew |
FIRSTNAME_Id | 41432-95P |
PHONE_value | +1 (206) 652-8773 |
HQSUBREGION_value | North America |
HQADDRESSLINE2_value | Suite 410 |
HQADDRESSLINE1_value | 1201 Western Avenue |
HQFAX_value | +1 (206) 456-7877 |
MIDDLENAME_value | R. |
COMPANYWEBSITE_value | www.globalpartnerships.org |
HQZIPCODE_value | 98101 |
我知道这与您的代码显示的内容不同,但我只想给您一个粗略的想法(方法)。我相信您可以调整代码,特别是extract_columns,以满足您的需求。
0赞
Andrej Kesely
7/2/2023
#2
您可以参考下面的示例,了解如何将 JSON 解析为 DataFrame:
import json
import pandas as pd
# JSON text is UTF-8 by specification, so state the encoding explicitly.
with open('your_file.json', 'r', encoding='utf-8') as f_in:
    data = json.load(f_in)

# Bookkeeping keys that should not become DataFrame columns.
dropped_keys = ('accessStatus', 'columnValueType')

out = []
for r in data['dataRows']:
    d = {}
    for k, v in r['columnValues'].items():
        if not v:
            # Keep an (empty) column for lists that carry no value.
            d[k] = None
            continue
        # Only the first item of each list is used. The unwanted keys are
        # filtered rather than deleted, so the loaded JSON is left untouched
        # and a missing key cannot raise KeyError.
        for kk, vv in v[0].items():
            if kk not in dropped_keys:
                d[f'{k}.{kk}'] = vv
    out.append(d)

df = pd.DataFrame(out)
print(df)
输出:
lastName.name lastName.Id lastName.unpublished gender.value hqCity.value prefix.value lastUpdateDate.expected lastUpdateDate.asOfdate companyName.name companyName.Id companyName.unpublished companyName.profileType roles.marked roles.value dailyUpdates assetClass hqCountry.value latestNoteAuthor primaryPosition.value boardSeats.marked boardSeats.value fundRoles institution.value latestNote Id.value hqRegion.value email dealRoles PrimaryCompanyType.value mgtRoles hqStateProvince.value fullName.name fullName.Id fullName.unpublished hqLocation.value biography.value biography.morningstarSource firstName.name firstName.Id firstName.unpublished phone.value hqSubRegion.value hqAddressLine2.value hqAddressLine1.value hqFax.value middleName.value companyWebsite.value hqZipCode.value weeklyUpdates
0 McBrady 41432-95P False Male Seattle Dr. False 2023-06-26 Global Partnerships 56347-39 False INVESTOR False 3 None None United States None Chair, Enterprise Risk, Compliance and Audit Committee and Member of the Board of Directors False 2 None Harvard University None 41432-95P Americas None None Not-For-Profit Venture Capital None Washington Matthew McBrady Ph.D 41432-95P False Seattle, WA Dr. Matthew McBrady serves as Chair, of the Enterprise Risk, Compliance, and Audit Committee. True Matthew 41432-95P False +1 (206) 652-8773 North America Suite 410 1201 Western Avenue +1 (206) 456-7877 R. www.globalpartnerships.org 98101 None
评论
dataRows
"lastName"
"name"
"Id"
"hqCountry"