提问人:Taliesin 提问时间:11/6/2023 最后编辑:Taliesin 更新时间:11/7/2023 访问量:171
在 Python 中将 XML 解析为 CSV
Parsing XML to CSV in Python
问:
我正在尝试编写一个 Python 脚本来解析 XML 文件并为 XML 中的每个表生成 CSV 文件。每个表都应包含其属性。此外,我想创建一个 CSV 文件来表示这些表之间的关系。
这是我的代码:
import xml.etree.ElementTree as ET
import csv
def _child_text(element, tag):
    """Return the text of the direct child *tag* of *element*, or None if absent."""
    child = element.find(tag)
    return None if child is None else child.text


def extract_tables_and_attributes(xml_file):
    """Collect every UML ``Class`` element and its ``Property`` descendants.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``. The file is
            decoded with a parser forced to ``windows-1252``.

    Returns:
        A list of ``(table_name, attributes)`` tuples in document order.
        ``attributes`` is a list of ``[attr_name, attr_type]`` pairs. A name
        or type whose child element is missing is reported as ``None``
        instead of aborting the whole extraction with ``AttributeError``.

    Raises:
        xml.etree.ElementTree.ParseError: If the document is not well-formed.
    """
    uml = "{http://www.omg.org/spec/UML/20090901}"
    name_tag = f"{uml}name"
    type_tag = f"{uml}type"

    parser = ET.XMLParser(encoding="windows-1252")
    root = ET.parse(xml_file, parser=parser).getroot()

    table_data = []
    for table in root.iterfind(f".//{uml}Class"):
        # Properties are searched at any depth below the class element.
        table_attributes = [
            [_child_text(attr, name_tag), _child_text(attr, type_tag)]
            for attr in table.iterfind(f".//{uml}Property")
        ]
        table_data.append((_child_text(table, name_tag), table_attributes))
    return table_data
def export_to_csv(table_data):
    """Write one ``<table_name>.csv`` file per table.

    Args:
        table_data: Iterable of ``(table_name, attributes)`` tuples, where
            ``attributes`` is an iterable of ``(attr_name, attr_type)`` pairs.

    Each file starts with a fixed two-column header row followed by one row
    per attribute pair. Files are opened with the platform default encoding.
    """
    header = ['Attribute Name', 'Attribute Type']
    for table_name, attributes in table_data:
        with open(f"{table_name}.csv", 'w', newline='') as out:
            writer = csv.writer(out)
            writer.writerow(header)
            # Unpack each pair explicitly so a malformed entry still fails loudly.
            writer.writerows([name, kind] for name, kind in attributes)
if __name__ == "__main__":
    # Input model file, resolved relative to the current working directory.
    xml_file = r"Gemeentelijk Gegevensmodel XMI2.1.2.xml"
    # Extract every table with its attributes, then write one CSV per table.
    export_to_csv(extract_tables_and_attributes(xml_file))
但是,我遇到以下错误:
Traceback (most recent call last):
File "ggm to csv.py", line 39, in <module>
table_data = extract_tables_and_attributes(xml_file)
File "ggm to csv.py", line 6, in extract_tables_and_attributes
tree = ET.parse(xml_file, parser=parser)
File "xml\etree\ElementTree.py", line 1203, in parse
tree.parse(source, parser)
File "xml\etree\ElementTree.py", line 571, in parse
parser.feed(data)
xml.etree.ElementTree.ParseError: not well-formed (invalid token): line 71765, column 176
我正在使用以下 XML,它看起来并不存在格式不正确(not well-formed)的问题。如果有人能帮我解决这个问题,那就太好了。
我尝试使用从Enterprise Architect到xml(uml)的不同导出来运行脚本,具有多个版本,但是我在同一行上一直遇到相同的问题。
编辑:
期望的结果如下所示(一个类型为 Class 的表“PAND”,以及它的各个属性和属性类型。这里只展示了一个表,但我希望每个表都生成各自的 CSV 文件):
Attribute Name,Attribute Type
Bruto inhoud pand,EAJava_N6
Datum begin geldigheid pand,EAJava_DATUM
Datum einde geldigheid pand,EAJava_DATUM
GeometriePunt,EAJava_GM_Point
Hoogste bouwlaag pand,EAJava_N3
Identificatie BGTPND,EAJava_NEN3610ID
Ind planobject,EAJava_INDIC
Indicatie geconstateerd pand,EAJava_INDIC
Inwinning geometrie bovenaanzicht,EAJava_GM_Object
Inwinning geometrie maaiveld,EAJava_GM_Object
Laagste bouwlaag pand,EAJava_N3
Label nummeraanduidingreeks,EAJava_C74E1553_32AE_4fd8_9796_00C6E1C51A11
Lod1 geometrie pand,EAJava_GM_Object
Lod2 geometrie pand,EAJava_GM_Object
Lod3 geometrie pand,EAJava_GM_Object
Oorspronkelijk bouwjaar pand,EAJava_JAAR
Oppervlakte pand,EAJava_N6
Pandgeometrie bovenaanzicht,EAJava_GM_Surface
Pandgeometrie Maaiveld,EAJava_GM_MultiSurface
Pandidentificatie,EAJava_AB8B30D0_FD1F_4c44_9396_BB05389EA20B
Pandstatus,EAJava_E2CC5DFC_C264_4c21_8E47_F551958E1C17
Relatieve hoogteligging pand,EAJava_N2
Status voortgang Bouw,EAJava_8C49F097_6D95_4406_B3B7_58AC102B6FD2
答:
我按照 Python 文档中处理 Unicode 的方法,使用了 errors="surrogateescape"。我首先用 charset_normalizer 检查了编码。XML 声明似乎不正确:
import xml.etree.ElementTree as ET
from charset_normalizer import detect
import csv
filename = 'Gemeentelijk Gegevensmodel XMI2.1.2.xml'

# Read the raw bytes and let charset_normalizer guess the real encoding,
# because the encoding named in the XML declaration is not trusted here.
with open(filename, 'rb') as f:
    xml = f.read()

result = detect(xml)
print(result)
en_cod = result["encoding"]
en_conf = result["confidence"]
print("encoding", en_cod)
print("confidence", en_conf)

# Without a fully confident guess no converted copy is written, and any later
# step would either fail on the missing file or silently pick up a stale
# 'NewXML_' copy from an earlier run. Stop here with a clear message instead.
if en_conf != 1.0:
    raise SystemExit(
        f"Encoding detection for {filename!r} is not conclusive "
        f"(encoding={en_cod!r}, confidence={en_conf!r}); not rewriting the file."
    )

# surrogateescape keeps undecodable bytes round-trippable instead of raising.
with open(filename, 'r', encoding=en_cod, errors="surrogateescape") as f:
    data = f.read()

# remove wrong declaration
data1 = data.replace("<?xml version='1.0' encoding='windows-1252' ?>\n", "")

with open('NewXML_'+filename, 'w', encoding="utf-8", errors="surrogateescape") as f:
    # add new declaration
    f.write("<?xml version='1.0' encoding='UTF-8' ?>\n")
    f.write(data1)
# Parse the rewritten copy of the model (the file named 'NewXML_' + filename).
tree = ET.parse('NewXML_'+filename)
root = tree.getroot()
# Build a prefix -> URI mapping from the document's own namespace
# declarations so the XPath below can use the "xmi:" prefix.
ns = dict([node for _, node in ET.iterparse('NewXML_'+filename, events=['start-ns'])])
#print(ns)
table_data = []
# Select every <packagedElement> whose xmi:type attribute is "uml:Class".
packagedElements = root.findall('.//packagedElement[@xmi:type="uml:Class"]', ns)
print(len(packagedElements))
# NOTE(review): the snippet lost its indentation; the nesting below is a
# reconstruction. It appends one row per class, which matches the two equal
# counts printed by this script. With this nesting, a class that lacks an
# ownedAttribute name or a type idref reuses `name` / `ty` from the previous
# class (or raises NameError on the first class) -- confirm this is intended.
for elem in packagedElements:
    # find() returns only the first matching descendant, so at most one
    # attribute name is taken per class.
    if elem.find(".//ownedAttribute") is not None and elem.find(".//ownedAttribute").get("name") is not None:
        name = elem.find(".//ownedAttribute").get("name")
    # Likewise only the first <type> descendant is consulted; its xmi:idref
    # attribute is used as the attribute type.
    if elem.find(".//type") is not None and elem.find(".//type").get("{http://schema.omg.org/spec/XMI/2.1}idref") is not None:
        ty = elem.find(".//type").get("{http://schema.omg.org/spec/XMI/2.1}idref")
    row = name, ty
    table_data.append(row)
print(table_data)
print(len(table_data))
# Dump the collected (name, type) rows as one semicolon-separated CSV file.
with open("table.csv", "w", encoding='utf-8', newline="\n") as csv_file:
    c_writer = csv.writer(csv_file, delimiter=';', quoting=csv.QUOTE_MINIMAL)
    c_writer.writerows(table_data)
输出:
{'encoding': 'cp775', 'language': 'English', 'confidence': 1.0}
encoding cp775
confidence 1.0
742
[('contactsoort', 'EAJava_AN80'), ('aantal kamers', 'EAJava_int'), ('einddatum', 'EAJava_Date'), ('winkelformule', 'EAJava_AN80'), ('aantalFulletimeVrouwen', 'EAJava_int'), ('aantalkassa', 'EAJava_int'), ('aanvangAanwezigheid', 'EAJava_DateTime'), ('nummer', 'EAJava_int'), ('naam', 'EAJava_AN250'), ('aanstellingsdatum', 'EAJava_Date'), ('naam', 'EAJava_AN250'), ('naam', 'EAJava_AN250'), ('naam', 'EAJava_AN250'), ('naam', 'EAJava_AN250'), ('aanstellingsdatum', 'EAJava_Date'), ('besloten', 'EAJava_Boolean'), ... shorted ]
742
评论
使用 recover=True 会将文件从 18MB 截断到 5MB 左右,因此丢失了很多内容
sed -re 's/“|”/\"/g' file.xml
我试图分析XML并查看哪些行(单词)的字节超过127(\x7F)(“高位”):
import csv
# For every line of the input, report the space-separated words that contain
# at least one byte above 127 (i.e. any non-ASCII byte).
with (
    open("input.xml", "rb") as f_in,
    open("output.csv", "w", newline="") as f_out,
):
    writer = csv.writer(f_out)
    writer.writerow(["Line #", "Words w/high bits"])
    for i, line in enumerate(f_in, start=1):
        # Drop tabs and newlines so they do not stick to the words.
        line = line.translate(None, b"\t\n")
        words: list[bytes] = [
            word for word in line.split(b" ") if not word.isascii()
        ]
        if words:
            # The whole list goes into one cell, so the CSV shows its repr.
            writer.writerow([i, words])
我在这两行中发现了“Plaatsoriëntatie”一词的不一致编码:
Line #, Words w/high bits
...
7638, "[b'name=""Plaatsori\xc3\xabntatie"">']"
7641, "[b'xmi:idref=""EAJava_enum_Plaatsori\xc3_ntatie""/>']"
...
- 第一行正确地将“ë”编码为 UTF-8(\xc3\xab)。
- 第二行看起来曾经被正确地编码为 UTF-8,然后其中的 \xab(在 ISO-8859-1 和 Windows-1252 中为“«”)看起来已被替换为“_”。
我有一种预感,源文件本身就带有格式错误的编码(例如 \xc3_);如果你能查清楚原因,我很想知道真相。祝你好运。
评论
utf-8
U+201D
U+201D
uml:Class
//{http://www.omg.org/spec/UML/20090901}Class