1. 程式人生 > 通過xsd schema結構來驗證xml是否合法

通過xsd schema結構來驗證xml是否合法

 1 import sys
 2 import StringIO
 3 import lxml
 4 
 5 from lxml import etree
 6 from StringIO import StringIO
 7 
 8 # Construct XML relevant to the XML schema we're validating against. By altering the string, adding/removing elements
 9 # we can force different errors to occur when validating.
10 xml = StringIO('''
11 <CompanyDataRequest xmlns="http://xmlgw.companieshouse.gov.uk" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://xmlgw.companieshouse.gov.uk http://xmlgw.companieshouse.gov.uk/v2-1/schema/CompanyData-v2-2.xsd"> 12 <CompanyNumber>06937730</CompanyNumber> 13 <CompanyAuthenticationCode>123456</CompanyAuthenticationCode>
14 <MadeUpDate>2010-06-30x</MadeUpDate> 15 </CompanyDataRequest> 16 ''') 17 18 # Clear any previous errors 19 lxml.etree.clear_error_log() 20 21 try: 22 # Get the XML schema to validate against 23 schema = lxml.etree.XMLSchema(file = 'http://xmlgw.companieshouse.gov.uk/v2-1/schema/CompanyData-v2-2.xsd
') 24 # Parse string of XML 25 xml_doc = lxml.etree.parse(xml) 26 # Validate parsed XML against schema returning a readable message on failure 27 schema.assertValid(xml_doc) 28 # Validate parsed XML against schema returning boolean value indicating success/failure 29 print 'schema.validate() returns "%s".' % schema.validate(xml_doc) 30 31 except lxml.etree.XMLSchemaParseError, xspe: 32 # Something wrong with the schema (getting from URL/parsing) 33 print "XMLSchemaParseError occurred!" 34 print xspe 35 36 except lxml.etree.XMLSyntaxError, xse: 37 # XML not well formed 38 print "XMLSyntaxError occurred!" 39 print xse 40 41 except lxml.etree.DocumentInvalid, di: 42 # XML failed to validate against schema 43 print "DocumentInvalid occurred!" 44 45 error = schema.error_log.last_error 46 if error: 47 # All the error properties (from libxml2) describing what went wrong 48 print 'domain_name: ' + error.domain_name 49 print 'domain: ' + str(error.domain) 50 print 'filename: ' + error.filename # '<string>' cos var is a string of xml 51 print 'level: ' + str(error.level) 52 print 'level_name: ' + error.level_name # an integer 53 print 'line: ' + str(error.line) # a unicode string that identifies the line where the error occurred. 54 print 'message: ' + error.message # a unicode string that lists the message. 55 print 'type: ' + str(error.type) # an integer 56 print 'type_name: ' + error.type_name

封裝類

 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*-
 3 # Author:Eric.yue
 4 
 5 import os
 6 import lxml.etree as ET
 7 from StringIO import StringIO
 8 import chardet
 9 
10 
class R3xmlCheck(object):
    """Check that an XML file is well formed and validate it against an XSD.

    Results are reported on standard output; see the individual methods for
    what each one returns.
    """

    # Schema used when the caller does not supply one.
    DEFAULT_XSD_PATH = "./xsd/multicacheschemas/MCCI_IN200100UV01.xsd"

    def __init__(self, element_xml, xsd_path=None):
        """Store the paths to work on.

        Args:
            element_xml: path of the XML file to check.
            xsd_path: optional path of the XSD file; defaults to
                DEFAULT_XSD_PATH.
        """
        self.elem_xml = element_xml
        self.xsd_path = self.DEFAULT_XSD_PATH if xsd_path is None else xsd_path

    def validate_xsd_xml(self, f_xml, elem_xsd):
        """Validate XML content against XSD content and print the outcome.

        Args:
            f_xml: content of the XML document.
            elem_xsd: content of the XSD schema.

        Returns:
            True when the document validates, False when one of the handled
            lxml errors (schema parse error, XML syntax error, invalid
            document) occurred.
        """
        try:
            # Only text needs encoding. Calling .encode() on a Python 2 byte
            # string first decodes it as ASCII, which fails for any schema
            # containing non-ASCII bytes.
            if not isinstance(elem_xsd, bytes):
                elem_xsd = elem_xsd.encode('utf-8')
            xmlschema = ET.XMLSchema(ET.parse(StringIO(elem_xsd)))
            xml = ET.parse(StringIO(f_xml))
            xmlschema.assertValid(xml)
            print('schema.validate() returns "%s".' % xmlschema.validate(xml))
            return True

        except ET.XMLSchemaParseError as xspe:
            # Something wrong with the schema (getting from URL/parsing)
            print("XMLSchemaParseError occurred!")
            print(xspe)

        except ET.XMLSyntaxError as xse:
            # XML not well formed
            print("XMLSyntaxError occurred!")
            print(xse)

        except ET.DocumentInvalid:
            # XML failed to validate against schema. Only assertValid() is
            # expected to raise this, by which point xmlschema is bound.
            print("DocumentInvalid occurred!")
            self._print_last_error(xmlschema)

        return False

    def _print_last_error(self, schema):
        """Print every property of the most recent entry in schema.error_log."""
        error = schema.error_log.last_error
        if not error:
            return
        # All the error properties (from libxml2) describing what went wrong.
        # Values wrapped in str() are not strings; the rest are concatenated
        # directly.
        print('domain_name: ' + error.domain_name)
        print('domain: ' + str(error.domain))
        print('filename: ' + error.filename)  # '<string>' cos var is a string of xml
        print('level: ' + str(error.level))
        print('level_name: ' + error.level_name)
        print('line: ' + str(error.line))  # line where the error occurred
        print('message: ' + error.message)  # text describing the error
        print('type: ' + str(error.type))
        print('type_name: ' + error.type_name)

    def run(self):
        """Check well-formedness, then validate the file against the XSD.

        Returns:
            The failure message from validate_xml() when the file is missing
            or cannot be parsed; otherwise None (the schema validation result
            is printed by validate_xsd_xml()).
        """
        res = self.validate_xml(self.elem_xml)
        # .get() keeps this safe even if a subclass returns a sparse dict.
        if res.get("result") is not True:
            return res.get("info")

        elem_xsd = self.get_xsd()

        with open(self.elem_xml) as f:
            f_xml = f.read()
        chardet_info = chardet.detect(f_xml)
        if chardet_info['encoding'] == 'ascii':
            f_xml = f_xml.encode('utf-8')
        self.validate_xsd_xml(f_xml.strip(), elem_xsd)

    # NOTE(review): the original comment here read "matching schemaLocation
    # url"; the schema is currently taken from a fixed path rather than from
    # the document's schemaLocation -- confirm whether that is still planned.
    def get_xsd(self):
        """Return the content of the configured XSD file."""
        # getattr() keeps this working for instances built without __init__.
        path = getattr(self, "xsd_path", self.DEFAULT_XSD_PATH)
        with open(path) as f:
            return f.read()

    def validate_xml(self, exml):
        """Check that *exml* exists and parses as XML.

        Args:
            exml: path of the XML file.

        Returns:
            A dict with key 'result' (True or False) and, on failure, key
            'info' holding a message describing the problem.
        """
        rinfo = {}
        if not os.path.exists(exml):
            # Previously an empty dict was returned here, which made run()
            # fail with KeyError instead of reporting the missing file.
            rinfo['result'] = False
            rinfo['info'] = 'File not found:{0}'.format(exml)
            return rinfo
        try:
            ET.parse(exml)
            rinfo['result'] = True
        except Exception as err:
            rinfo['result'] = False
            rinfo['info'] = 'Parsing error info:{0}'.format(err)
        return rinfo
83 
if __name__ == "__main__":
    checker = R3xmlCheck("./xsd/aa.xml")
    # run() returns a message when the XML file cannot be parsed; show it
    # instead of silently dropping it.
    problem = checker.run()
    if problem:
        print(problem)