使用 python 复制 XML 元素并将其添加到 XML 文件中的特定位置

Duplicating an XML element and adding it to a specific position in XML file using python

提问人:Yash Pisat 提问时间:9/29/2023 更新时间:9/29/2023 访问量:42

问:

我有一个xml文件,其中的内容如下所示:

xml_content_to_search =

<Document ProviderID="TD" DecimalMarker="comma" Website="https://erc-viewer.sap.com/">
<available_substances>
        <substance ID="0004" DD="14" MM="10" YYYY="2010">
            <SubName>0004</SubName>
            <url>./UN/0004.xml</url>
            <group>ADR0004_0101</group>
            <group>THP0004Y0101</group>
            <group>THC0004Y0101</group>
            <group>TRP0004Y0101</group>
            <group>TRC0004Y0101</group>
            <group>TIP0004Y0101</group>
            <group>TIC0004Y0101</group>
            <group>CTR0004Y0102</group>
            <group>CRP0004Y0102</group>
            <group>CRC0004Y0102</group>
            </substance>
        <substance ID="ADR0004_0101" DD="26" MM="10" YYYY="2022">
            <SubName>asa</SubName>
            <url>ADR/ADR0004_0101.xml</url>
        </substance>
        <substance ID="THP0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd)</SubName>
            <url>THP/THP0004Y0101.xml</url>
        </substance>
        <substance ID="THC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>THC/THC0004Y0101.xml</url>
        </substance>
        <substance ID="TRP0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRP/TRP0004Y0101.xml</url>
        </substance>
        <substance ID="TRC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRC/TRC0004Y0101.xml</url>
        </substance>
        </available_substances>
        </Document>

我想在 XML 文件中按 ID 查找特定的 <substance> 元素,复制它并做一些修改,这一部分我已经实现了。但是在复制之后,我想把复制出来的元素插入到被复制的那个 <substance> 元素的正下方。

这是我的代码:

# Script purpose: for every <substance> whose ID matches the name of an XML file
# in IAC_files_path, build "IUC" and "IEC" variants of that element and insert
# them right after the original.
# NOTE(review): the snippet uses os, ET, IAC_files_path, xml_file_names and
# xml_content_to_search without defining them; ET is presumably
# xml.etree.ElementTree -- confirm against the full file.

# Use the os.listdir() method to list all files in the specified folder and filter for XML files
for filename in os.listdir(IAC_files_path):
    if filename.endswith(".xml"):
        # Remove the ".xml" extension before adding to the list
        xml_file_names.append(os.path.splitext(filename)[0])

# Parse the XML content to search for <substance> elements with matching IDs
tree = ET.ElementTree(ET.fromstring(xml_content_to_search))
root = tree.getroot()

# Initialize a flag to check if at least one match is found
match_found = False

# Create a list to store duplicated <substance> elements
# (only referenced by the commented-out code further down)
duplicated_substance_elements = []

# Iterate through the <substance> elements and search for matching IDs
# (".//substance" matches at any depth below root, not only direct children)
for substance_element in root.findall(".//substance"):
    substance_id = substance_element.get("ID")
    print(f"Processing substance_id: {substance_id}")
    # Check if the ID without the extension is in the list
    base_substance_id = os.path.splitext(substance_id)[0]
    if base_substance_id in xml_file_names:
        # Print the XML file name found in the <substance> element's ID attribute
        print(f"Found XML file name '{substance_id}' in the other XML file.")
        match_found = True

        # Create a new <substance> element with modified attributes for IUC
        duplicate_substance_element_iuc = ET.Element("substance")
        duplicate_substance_element_iuc.set("ID", base_substance_id.replace("IAC", "IUC"))
        duplicate_substance_element_iuc.set("DD", substance_element.get("DD"))
        duplicate_substance_element_iuc.set("MM", substance_element.get("MM"))
        duplicate_substance_element_iuc.set("YYYY", substance_element.get("YYYY"))

        # Duplicate and modify the <SubName> element for IUC
        subname_element = substance_element.find("SubName")
        duplicate_subname_element_iuc = ET.Element("SubName")
        duplicate_subname_element_iuc.text = subname_element.text.replace("IAC", "IUC")
        duplicate_substance_element_iuc.append(duplicate_subname_element_iuc)

        # Duplicate and modify the <url> element for IUC
        url_element = substance_element.find("url")
        duplicate_url_element_iuc = ET.Element("url")
        duplicate_url_element_iuc.text = url_element.text.replace("IAC", "IUC")
        duplicate_substance_element_iuc.append(duplicate_url_element_iuc)

        # Insert the duplicated IUC <substance> element immediately after the original IAC element
        # NOTE(review): list(root) holds only the direct children of root. In the
        # sample XML root is <Document> and every <substance> sits inside
        # <available_substances>, so substance_element is not in that list and
        # .index() raises ValueError ("... is not in list"). The index lookup and
        # the insert would have to target the element's actual parent instead.
        substance_element_index = list(root).index(substance_element)
        root.insert(substance_element_index + 1, duplicate_substance_element_iuc)

        # Create a new <substance> element with modified attributes for IEC
        duplicate_substance_element_iec = ET.Element("substance")
        duplicate_substance_element_iec.set("ID", base_substance_id.replace("IAC", "IEC"))
        duplicate_substance_element_iec.set("DD", substance_element.get("DD"))
        duplicate_substance_element_iec.set("MM", substance_element.get("MM"))
        duplicate_substance_element_iec.set("YYYY", substance_element.get("YYYY"))

        # Duplicate and modify the <SubName> element for IEC
        duplicate_subname_element_iec = ET.Element("SubName")
        duplicate_subname_element_iec.text = subname_element.text.replace("IAC", "IEC")
        duplicate_substance_element_iec.append(duplicate_subname_element_iec)

        # Duplicate and modify the <url> element for IEC
        duplicate_url_element_iec = ET.Element("url")
        duplicate_url_element_iec.text = url_element.text.replace("IAC", "IEC")
        duplicate_substance_element_iec.append(duplicate_url_element_iec)
        
        # Insert the duplicated IEC <substance> element two positions after the
        # original, i.e. directly behind the IUC duplicate inserted above.
        # NOTE(review): same direct-child lookup on root as for the IUC insert.
        substance_element_index = list(root).index(substance_element)
        root.insert(substance_element_index + 2, duplicate_substance_element_iec)

        # Append the duplicated IEC <substance> element to the list
        #duplicated_substance_elements.append(duplicate_substance_element_iec)

# Check if no matches were found and print "Not found" message
if not match_found:
    print("No XML file names were found in the other XML file.")
    
# # Append the duplicated IEC <substance> elements to the end
# for duplicate_element in duplicated_substance_elements:
#     root.append(duplicate_element)

# Print the modified XML content
modified_xml_content = ET.tostring(root, encoding="unicode")
print(modified_xml_content)

我收到此错误:

<Element 'substance' at 0x000002BF2DFE8720> is not in list

在这一行代码中

substance_element_index = list(root).index(substance_element)

我想要的输出是这样的:

<Document ProviderID="TD" DecimalMarker="comma" Website="https://erc-viewer.sap.com/">
<available_substances>
        <substance ID="0004" DD="14" MM="10" YYYY="2010">
            <SubName>0004</SubName>
            <url>./UN/0004.xml</url>
            <group>ADR0004_0101</group>
            <group>THP0004Y0101</group>
            <group>THC0004Y0101</group>
            <group>TRP0004Y0101</group>
            <group>TRC0004Y0101</group>
            <group>TIP0004Y0101</group>
            <group>TIC0004Y0101</group>
            <group>CTR0004Y0102</group>
            <group>CRP0004Y0102</group>
            <group>CRC0004Y0102</group>
            </substance>
        <substance ID="ADR0004_0101" DD="26" MM="10" YYYY="2022">
            <SubName>asa</SubName>
            <url>ADR/ADR0004_0101.xml</url>
        </substance>
        <substance ID="THP0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd)</SubName>
            <url>THP/THP0004Y0101.xml</url>
        </substance>
        <substance ID="THC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>THC/THC0004Y0101.xml</url>
        </substance>
        <substance ID="TRP0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRP/TRP0004Y0101.xml</url>
        </substance>
        <substance ID="TRC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRC/TRC0004Y0101.xml</url>
        </substance>
        **<substance ID="IEC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRC/TRC0004Y0101.xml</url>
        </substance>**
        </available_substances>
        </Document>

我有一个xml文件,其中的内容如下所示:

xml_content_to_search =

<Document ProviderID="TD" DecimalMarker="comma" Website="https://erc-viewer.sap.com/">
<available_substances>
        <substance ID="0004" DD="14" MM="10" YYYY="2010">
            <SubName>0004</SubName>
            <url>./UN/0004.xml</url>
            <group>ADR0004_0101</group>
            <group>THP0004Y0101</group>
            <group>THC0004Y0101</group>
            <group>TRP0004Y0101</group>
            <group>TRC0004Y0101</group>
            <group>TIP0004Y0101</group>
            <group>TIC0004Y0101</group>
            <group>CTR0004Y0102</group>
            <group>CRP0004Y0102</group>
            <group>CRC0004Y0102</group>
            </substance>
        <substance ID="ADR0004_0101" DD="26" MM="10" YYYY="2022">
            <SubName>asa</SubName>
            <url>ADR/ADR0004_0101.xml</url>
        </substance>
        <substance ID="THP0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd)</SubName>
            <url>THP/THP0004Y0101.xml</url>
        </substance>
        <substance ID="THC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>THC/THC0004Y0101.xml</url>
        </substance>
        <substance ID="TRP0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRP/TRP0004Y0101.xml</url>
        </substance>
        <substance ID="TRC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRC/TRC0004Y0101.xml</url>
        </substance>
        </available_substances>
        </Document>

我想在 XML 文件中按 ID 查找特定的 <substance> 元素,复制它并做一些修改,这一部分我已经实现了。但是在复制之后,我想把复制出来的元素插入到被复制的那个 <substance> 元素的正下方。

这是我的代码:

# Script purpose: for every <substance> whose ID matches the name of an XML file
# in IAC_files_path, build "IUC" and "IEC" variants of that element and insert
# them right after the original.
# NOTE(review): the snippet uses os, ET, IAC_files_path, xml_file_names and
# xml_content_to_search without defining them; ET is presumably
# xml.etree.ElementTree -- confirm against the full file.

# Use the os.listdir() method to list all files in the specified folder and filter for XML files
for filename in os.listdir(IAC_files_path):
    if filename.endswith(".xml"):
        # Remove the ".xml" extension before adding to the list
        xml_file_names.append(os.path.splitext(filename)[0])

# Parse the XML content to search for <substance> elements with matching IDs
tree = ET.ElementTree(ET.fromstring(xml_content_to_search))
root = tree.getroot()

# Initialize a flag to check if at least one match is found
match_found = False

# Create a list to store duplicated <substance> elements
# (only referenced by the commented-out code further down)
duplicated_substance_elements = []

# Iterate through the <substance> elements and search for matching IDs
# (".//substance" matches at any depth below root, not only direct children)
for substance_element in root.findall(".//substance"):
    substance_id = substance_element.get("ID")
    print(f"Processing substance_id: {substance_id}")
    # Check if the ID without the extension is in the list
    base_substance_id = os.path.splitext(substance_id)[0]
    if base_substance_id in xml_file_names:
        # Print the XML file name found in the <substance> element's ID attribute
        print(f"Found XML file name '{substance_id}' in the other XML file.")
        match_found = True

        # Create a new <substance> element with modified attributes for IUC
        duplicate_substance_element_iuc = ET.Element("substance")
        duplicate_substance_element_iuc.set("ID", base_substance_id.replace("IAC", "IUC"))
        duplicate_substance_element_iuc.set("DD", substance_element.get("DD"))
        duplicate_substance_element_iuc.set("MM", substance_element.get("MM"))
        duplicate_substance_element_iuc.set("YYYY", substance_element.get("YYYY"))

        # Duplicate and modify the <SubName> element for IUC
        subname_element = substance_element.find("SubName")
        duplicate_subname_element_iuc = ET.Element("SubName")
        duplicate_subname_element_iuc.text = subname_element.text.replace("IAC", "IUC")
        duplicate_substance_element_iuc.append(duplicate_subname_element_iuc)

        # Duplicate and modify the <url> element for IUC
        url_element = substance_element.find("url")
        duplicate_url_element_iuc = ET.Element("url")
        duplicate_url_element_iuc.text = url_element.text.replace("IAC", "IUC")
        duplicate_substance_element_iuc.append(duplicate_url_element_iuc)

        # Insert the duplicated IUC <substance> element immediately after the original IAC element
        # NOTE(review): list(root) holds only the direct children of root. In the
        # sample XML root is <Document> and every <substance> sits inside
        # <available_substances>, so substance_element is not in that list and
        # .index() raises ValueError ("... is not in list"). The index lookup and
        # the insert would have to target the element's actual parent instead.
        substance_element_index = list(root).index(substance_element)
        root.insert(substance_element_index + 1, duplicate_substance_element_iuc)

        # Create a new <substance> element with modified attributes for IEC
        duplicate_substance_element_iec = ET.Element("substance")
        duplicate_substance_element_iec.set("ID", base_substance_id.replace("IAC", "IEC"))
        duplicate_substance_element_iec.set("DD", substance_element.get("DD"))
        duplicate_substance_element_iec.set("MM", substance_element.get("MM"))
        duplicate_substance_element_iec.set("YYYY", substance_element.get("YYYY"))

        # Duplicate and modify the <SubName> element for IEC
        duplicate_subname_element_iec = ET.Element("SubName")
        duplicate_subname_element_iec.text = subname_element.text.replace("IAC", "IEC")
        duplicate_substance_element_iec.append(duplicate_subname_element_iec)

        # Duplicate and modify the <url> element for IEC
        duplicate_url_element_iec = ET.Element("url")
        duplicate_url_element_iec.text = url_element.text.replace("IAC", "IEC")
        duplicate_substance_element_iec.append(duplicate_url_element_iec)
        
        # Insert the duplicated IEC <substance> element two positions after the
        # original, i.e. directly behind the IUC duplicate inserted above.
        # NOTE(review): same direct-child lookup on root as for the IUC insert.
        substance_element_index = list(root).index(substance_element)
        root.insert(substance_element_index + 2, duplicate_substance_element_iec)

        # Append the duplicated IEC <substance> element to the list
        #duplicated_substance_elements.append(duplicate_substance_element_iec)

# Check if no matches were found and print "Not found" message
if not match_found:
    print("No XML file names were found in the other XML file.")
    
# # Append the duplicated IEC <substance> elements to the end
# for duplicate_element in duplicated_substance_elements:
#     root.append(duplicate_element)

# Print the modified XML content
modified_xml_content = ET.tostring(root, encoding="unicode")
print(modified_xml_content)

我收到此错误:

<Element 'substance' at 0x000002BF2DFE8720> is not in list

在这一行代码中

substance_element_index = list(root).index(substance_element)

我想要的输出是这样的:

<Document ProviderID="TD" DecimalMarker="comma" Website="https://erc-viewer.sap.com/">
<available_substances>
        <substance ID="0004" DD="14" MM="10" YYYY="2010">
            <SubName>0004</SubName>
            <url>./UN/0004.xml</url>
            <group>ADR0004_0101</group>
            <group>THP0004Y0101</group>
            <group>THC0004Y0101</group>
            <group>TRP0004Y0101</group>
            <group>TRC0004Y0101</group>
            <group>TIP0004Y0101</group>
            <group>TIC0004Y0101</group>
            <group>CTR0004Y0102</group>
            <group>CRP0004Y0102</group>
            <group>CRC0004Y0102</group>
            </substance>
        <substance ID="ADR0004_0101" DD="26" MM="10" YYYY="2022">
            <SubName>asa</SubName>
            <url>ADR/ADR0004_0101.xml</url>
        </substance>
        <substance ID="THP0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd)</SubName>
            <url>THP/THP0004Y0101.xml</url>
        </substance>
        <substance ID="THC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>THC/THC0004Y0101.xml</url>
        </substance>
        <substance ID="TRP0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRP/TRP0004Y0101.xml</url>
        </substance>
        <substance ID="TRC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRC/TRC0004Y0101.xml</url>
        </substance>
        **<substance ID="IEC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRC/TRC0004Y0101.xml</url>
        </substance>**
        </available_substances>
        </Document>
python jupyter-notebook xml 解析 数据操作

评论

0赞 Hermann12 9/29/2023
请编辑您的问题并删除不必要的重复文本。
0赞 Parfait 10/1/2023
请把所有 import 语句也包含进来。

答:

4赞 Hermann12 9/29/2023 #1

您可以复制元素,更改内容并插入它:

import xml.etree.ElementTree as ET
from copy import deepcopy

# Load the document and collect every <substance> element, at any depth.
tree = ET.parse('substance.xml')
root = tree.getroot()

sub = root.findall('.//substance')
print(len(sub))

# Work on a deep copy of the fourth <substance> (index 3) so the original
# element and its children are left untouched by the edits below.
co = deepcopy(sub[3])

# Replacement values, keyed by attribute name / child tag.
new_attributes = {
    'ID': 'THC0004Y0101_insert',
    'DD': '27',
    'MM': '11',
    'YYYY': '1998',
}
new_texts = {
    'SubName': 'iso',
    'url': 'ISO/ADR0004_010x.xml',
}

# iter() yields the copy itself first and then all of its descendants.
for elem in co.iter():
    if elem.tag == 'substance':
        for key, value in new_attributes.items():
            elem.set(key, value)
    if elem.tag in new_texts:
        elem.text = new_texts[elem.tag]

# Insert the copy as child number 4 of <available_substances>, which places
# it directly behind the element it was copied from.
container = root.find('.//available_substances')
container.insert(4, co)

ET.dump(root)

输出:


<Document ProviderID="TD" DecimalMarker="comma" Website="https://erc-viewer.sap.com/">
<available_substances>
        <substance ID="0004" DD="14" MM="10" YYYY="2010">
            <SubName>0004</SubName>
            <url>./UN/0004.xml</url>
            <group>ADR0004_0101</group>
            <group>THP0004Y0101</group>
            <group>THC0004Y0101</group>
            <group>TRP0004Y0101</group>
            <group>TRC0004Y0101</group>
            <group>TIP0004Y0101</group>
            <group>TIC0004Y0101</group>
            <group>CTR0004Y0102</group>
            <group>CRP0004Y0102</group>
            <group>CRC0004Y0102</group>
            </substance>
        <substance ID="ADR0004_0101" DD="26" MM="10" YYYY="2022">
            <SubName>asa</SubName>
            <url>ADR/ADR0004_0101.xml</url>
        </substance>
        <substance ID="THP0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd)</SubName>
            <url>THP/THP0004Y0101.xml</url>
        </substance>
        <substance ID="THC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>THC/THC0004Y0101.xml</url>
        </substance>
        <substance ID="THC0004Y0101_insert" DD="27" MM="11" YYYY="1998">
            <SubName>iso</SubName>
            <url>ISO/ADR0004_010x.xml</url>
        </substance>
        <substance ID="TRP0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRP/TRP0004Y0101.xml</url>
        </substance>
        <substance ID="TRC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRC/TRC0004Y0101.xml</url>
        </substance>
        </available_substances>
        </Document>