1. 程式人生 > >python模塊之xml.etree.ElementTree

python模塊之xml.etree.ElementTree

pat symbol fun import 數據 pyhton hat print off

Python有三種方法解析XML,SAX,DOM,以及ElementTree
###1.SAX (simple API for XML )
Python 標準庫包含SAX解析器。SAX是一種典型的極為快速的工具,在解析XML時不會占用大量內存。
但SAX是基於回調機制的:解析到特定數據時,它會調用相應的方法把數據傳遞出來。這意味著必須為這些數據編寫處理函數(handler),
並自行維護解析狀態,這是非常困難的。


###2.DOM(Document Object Model)
與SAX比較,DOM典型的缺點是比較慢,消耗更多的內存,因為DOM會將整個XML樹讀入內存中,並為樹
中的每一個節點建立一個對象。使用DOM的好處是你不需要對狀態進行追蹤,因為每一個節點都知道誰是它的
父節點,誰是子節點。但是DOM用起來有些麻煩。


###3.ElementTree(元素樹)
ElementTree就像一個輕量級的DOM,具有方便友好的API。代碼可用性好,速度快,消耗內存少,這裏主要
介紹ElementTree。

一 基本知識
1、插入節點
Element.insert(index, element) 、Element(tag[, attrib][, **extra]) 、SubElement(parent, tag[, attrib[, **extra]]) 、Element.append(subelement)
2、刪除節點
Element.remove(subelement) 刪除一個子節點、Element.clear() 清空該節點(刪除其下所有子節點,並清除該節點的屬性、text 與 tail)
3、在節點中插入屬性
Element.set(key, value)

4、查找節點

a) Element.iter(舊名 Element.getiterator,已在 Python 3.9 移除) b) list(element) 或直接叠代節點(舊名 Element.getchildren,已在 Python 3.9 移除) c) Element.find d) Element.findall

#!/usr/bin/python
# -*- encoding: utf-8 -*-

import os
import sys
import os.path
import xml.etree.ElementTree as ET


def read_xml(xmlFile, destDir):
    """Mirror the layout described by an XML file on disk and dump each file node as text.

    The XML is expected to have two levels of directory nodes below the root,
    followed by file nodes; every one of them is identified by its ``NAME``
    attribute::

        <?xml version="1.0" ?>
        <root>
          <FILE_DIRECTORY NAME="ca002">
           <FILE_DIRECTORY NAME="RT_CA">
            <FILE_NAME NAME="0000.obj">
              <COFF_FILE_HEAD BEGIN="0" END="20">
                <Machine>X86</Machine>
                <NumberOfSections>2</NumberOfSections>
                <PointerToSymbolTable>21205</PointerToSymbolTable>
                <NumberOfSymbols>107</NumberOfSymbols>
                <SizeOfOptionalHeader>0</SizeOfOptionalHeader>
                <Characteristics>0</Characteristics>
              </COFF_FILE_HEAD>
              <COFF_IMAGE_SECTIONS>
                <COFF_IMAGE_SECTION INDEX="0">
                  <Name>.rdata</Name>
                  <SizeOfRawData>5064</SizeOfRawData>
                  <PointerToRawData>100</PointerToRawData>
                  <PointerToRelocations>0</PointerToRelocations>
                  <PointerToLinenumbers>0</PointerToLinenumbers>
                  <NumberOfRelocations>0</NumberOfRelocations>
                  <NumberOfLinenumbers>0</NumberOfLinenumbers>
                </COFF_IMAGE_SECTION>
              </COFF_IMAGE_SECTIONS>
            </FILE_NAME>
           </FILE_DIRECTORY>
          </FILE_DIRECTORY>
        </root>

    For each first-level node a directory ``destDir/NAME`` is created, for
    each second-level node a directory below it, and for each third-level
    node a text file that receives the outline written by ``traversal`` for
    every child of that node.

    :param xmlFile: path of the XML file to parse.
    :param destDir: directory under which the output tree is created; it must
        already exist because only single levels are created with ``os.mkdir``.
    :return: None
    :raises KeyError: if a directory or file node has no ``NAME`` attribute.
    """
    # ET.parse loads a document from a file; ET.fromstring(text) is the
    # equivalent when the XML is already held in a string.
    root = ET.parse(xmlFile).getroot()

    # Elements are iterated directly: Element.getchildren() was removed in
    # Python 3.9, while plain iteration works on every version.
    for dir1_node in root:
        dir1 = os.path.join(destDir, dir1_node.attrib['NAME'])
        if not os.path.exists(dir1):
            os.mkdir(dir1)

        for dir2_node in dir1_node:
            dir2 = os.path.join(dir1, dir2_node.attrib['NAME'])
            if not os.path.exists(dir2):
                os.mkdir(dir2)

            for file_node in dir2_node:
                file_path = os.path.join(dir2, file_node.attrib['NAME'])
                # Mode 'w' creates the file or overwrites an existing one;
                # the with-block closes it even if traversal() raises.
                with open(file_path, 'w') as f:
                    # Dump everything below the <... NAME="xxx.obj"> node,
                    # starting at indentation 0.
                    for child in file_node:
                        traversal(child, f, 0)


def traversal(node, f, prelen):
    """Recursively write ``node`` and its descendants to ``f`` as an indented outline.

    Each element produces one line made of ``prelen`` dashes, the tag name,
    the attributes as ``(key:value key:value)`` when there are any, and
    ``:text`` when the element has text. Children are written the same way
    with the indentation increased by 4.

    :param node: the ``xml.etree.ElementTree.Element`` to dump.
    :param f: writable text file-like object; only ``f.write`` is used.
    :param prelen: number of ``-`` characters to put in front of this element.
    :return: None
    """
    line = '-' * prelen + node.tag

    if node.attrib:
        attrs = ' '.join('%s:%s' % (key, value)
                         for key, value in node.attrib.items())
        line += '(' + attrs + ')'

    # NOTE: the text is written verbatim, so for pretty-printed XML the
    # whitespace between a parent tag and its first child is emitted as well.
    if node.text is not None:
        line += ':' + node.text

    f.write(line + '\n')

    # Iterating the element yields its direct children; a leaf simply yields
    # nothing, which ends the recursion.
    for child in node:
        traversal(child, f, prelen + 4)


def parseXmls(filePath, destDir):
    """Walk ``filePath`` recursively and convert every ``.xml`` file with ``read_xml``.

    A regular file is converted when its name ends in ``.xml`` and is skipped
    otherwise. Anything else is treated as a directory: each entry name is
    printed and then processed recursively.

    :param filePath: an XML file or a directory to search for XML files.
    :param destDir: output directory handed through to ``read_xml``.
    :return: None
    :raises OSError: from ``os.listdir`` when ``filePath`` is neither a
        regular file nor a listable directory.
    """
    if os.path.isfile(filePath):
        # Only XML files are converted. Other regular files must be skipped
        # here, otherwise they would reach os.listdir() below and abort the
        # whole walk.
        if os.path.basename(filePath).endswith('.xml'):
            read_xml(filePath, destDir)
        return

    for item in os.listdir(filePath):
        print(item)
        parseXmls(os.path.join(filePath, item), destDir)


def main():
    """Ask for an input path on stdin and convert the XML files found there.

    The prompt is repeated until an existing path is entered. The output
    directory is the absolute input path with ``xml`` appended (for example
    ``/data/in`` -> ``/data/inxml``); it is created if missing and then
    passed to ``parseXmls`` together with the input path.

    :return: None
    """
    # raw_input exists only on Python 2; on Python 3 the same behaviour is
    # provided by input().
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    while True:
        source = read_line("input the dir:")
        if os.path.exists(source):
            break
        print("you input dir is not existed!")

    # Resolve to an absolute path first: for a relative input such as "foo",
    # os.path.split() returns an empty head, and joining that with a
    # separator would place the output at the filesystem root ("/fooxml").
    destDir = os.path.abspath(source) + 'xml'
    if not os.path.exists(destDir):
        os.mkdir(destDir)

    parseXmls(source, destDir)


# Start the interactive converter only when the file is run as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

  

python模塊之xml.etree.ElementTree