python模塊之xml.etree.ElementTree
Python有三種方法解析XML:SAX、DOM,以及ElementTree。
### 1. SAX (Simple API for XML)
Python 標準庫包含SAX解析器。SAX是一種典型的極為快速的工具,在解析XML時,不會占用大量內存。
但是它是基於回調機制的:解析過程中每遇到某種數據(例如元素開始、元素結束、文本),它就會調用相應的方法把數據傳遞出來。
這意味著必須為這些數據編寫處理函數(handler),並自行維護解析狀態,這是非常困難的。
### 2. DOM (Document Object Model)
與SAX比較,DOM典型的缺點是比較慢,消耗更多的內存,因為DOM會將整個XML樹讀入內存中,並為樹
中的每一個節點建立一個對象。使用DOM的好處是你不需要對狀態進行追蹤,因為每一個節點都知道誰是它的
父節點,誰是子節點。但是DOM用起來有些麻煩。
### 3. ElementTree(元素樹)
ElementTree就像一個輕量級的DOM,具有方便友好的API。代碼可用性好,速度快,消耗內存少,這裏主要
介紹ElementTree。
一 基本知識
1、插入節點
Element.insert(index, element) 、Element(tag[, attrib][, **extra]) 、SubElement(parent, tag[, attrib[, **extra]]) 、Element.append(subelement)
2、刪除節點
Element.remove(subelement) 刪除一個子節點;Element.clear() 清空該節點(刪除所有子節點,並清除其屬性、text 與 tail)
3、在節點中插入屬性
Element.set(key, value)
4、查找節點
a) Element.getiterator b) Element.getchildren c) Element.find d) Element.findall(註:getiterator 與 getchildren 已被棄用,並在 Python 3.9 中移除,請分別改用 Element.iter() 與 list(element))
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""Dump XML files into a directory tree of indented plain-text files.

The first two element levels below the XML root become directories, the
third level becomes a text file, and everything below that is written into
the file as an indented outline (see ``traversal``).
"""
import os
import sys
import os.path
import xml.etree.ElementTree as ET


def read_xml(xmlFile, destDir):
    """Convert one XML file into directories and text files under destDir.

    Expected layout of the XML document (names are taken from the ``NAME``
    attribute of each element)::

        <root>
          <FILE_DIRECTORY NAME="ca002">          -> directory
            <FILE_DIRECTORY NAME="RT_CA">        -> sub-directory
              <FILE_NAME NAME="0000.obj">        -> text file
                <COFF_FILE_HEAD BEGIN="0" END="20">
                  <Machine>X86</Machine>
                  ...
                </COFF_FILE_HEAD>
              </FILE_NAME>
            </FILE_DIRECTORY>
          </FILE_DIRECTORY>
        </root>

    :param xmlFile: path of the XML file to parse.
    :param destDir: existing directory that receives the generated tree.
    :return: None
    :raises KeyError: if an element on the first three levels has no
        ``NAME`` attribute.
    :raises xml.etree.ElementTree.ParseError: if the file is not
        well-formed XML.
    """
    root = ET.parse(xmlFile).getroot()

    # Iterating an Element yields its direct children; this replaces
    # Element.getchildren(), which was removed in Python 3.9.
    for dir1_node in root:
        # Plain concatenation (not os.path.join) is kept on purpose: join
        # would discard destDir if NAME happened to look like an absolute
        # path.
        dir1 = destDir + os.path.sep + dir1_node.attrib['NAME']
        os.makedirs(dir1, exist_ok=True)

        for dir2_node in dir1_node:
            dir2 = dir1 + os.path.sep + dir2_node.attrib['NAME']
            os.makedirs(dir2, exist_ok=True)

            for file_node in dir2_node:
                out_path = dir2 + os.path.sep + file_node.attrib['NAME']
                # Mode 'w' creates the file or overwrites an existing one.
                # The context manager closes it even if traversal() raises.
                with open(out_path, 'w', encoding='utf-8') as f:
                    for child in file_node:
                        traversal(child, f, 0)


def traversal(node, f, prelen):
    """Recursively write ``node`` and its descendants to ``f`` as an outline.

    Each element produces one line: ``prelen`` dashes, the tag name, the
    attributes as ``(key:value key:value)`` when there are any, and
    ``:text`` when the element has text. Children are written with the
    dash prefix increased by 4.

    :param node: the ``Element`` to write.
    :param f: a writable text file object.
    :param prelen: number of ``-`` characters to prefix the line with.
    :return: None
    """
    line = '-' * prelen + node.tag
    if node.attrib:
        attrs = ' '.join(f'{key}:{value}' for key, value in node.attrib.items())
        line += '(' + attrs + ')'
    if node.text is not None:
        # The text is written verbatim, including any whitespace the XML
        # author used for indentation.
        line += ':' + node.text
    f.write(line + '\n')

    for child in node:
        traversal(child, f, prelen + 4)


def parseXmls(filePath, destDir):
    """Process every ``.xml`` file found at or below ``filePath``.

    :param filePath: an XML file, or a directory that is searched
        recursively.
    :param destDir: existing directory passed on to ``read_xml``.
    :return: None
    :raises OSError: if ``filePath`` is neither a regular file nor a
        listable directory.
    """
    if os.path.isfile(filePath):
        # Regular files that are not XML are skipped; previously they fell
        # through to os.listdir() and raised.
        if os.path.basename(filePath).endswith('.xml'):
            read_xml(filePath, destDir)
        return

    for item in os.listdir(filePath):
        print(item)
        parseXmls(filePath + os.path.sep + item, destDir)


def main():
    """Ask for a source directory and convert all XML files below it.

    The output goes to a sibling directory named ``<source name>xml``.
    """
    while True:
        src_dir = input("input the dir:")
        if os.path.exists(src_dir):
            break
        print("you input dir is not existed!")

    # Normalise first so that a relative path or a trailing separator still
    # yields a sibling directory rather than one at the filesystem root or
    # inside the source directory.
    parent, base = os.path.split(os.path.abspath(src_dir))
    destDir = os.path.join(parent, base + 'xml')
    os.makedirs(destDir, exist_ok=True)

    parseXmls(src_dir, destDir)


if __name__ == '__main__':
    main()
python模塊之xml.etree.ElementTree