Tinyxml解析過程原始碼分析
阿新 • • 發佈:2018-11-10
tinyxml是一個優秀的,易用的,開源的xml解析庫,xml解析的最關鍵之處,就是如何將xml檔案內容解析成記憶體中的可用、易用的程式資料---DOM(Document Object Model)樹。DOM其實就是多叉樹,每個節點只需知道自己的第一個子節點(first child)和下一個兄弟節點(next sibling),即可實現元素資料的解析。
有關tinyxml內部的結構設計,本文不詳述,網上已有很多分析,請參見http://www.cnblogs.com/kex1n/archive/2010/09/23/1833468.html。本文重點分析tinyxml是生成DOM樹的過程,完成這個任務的函式就是TiXmlDocument::LoadFile()函式,下面分析其程式碼實現。1.TiXmlDocument::LoadFile() 開啟xml檔案
/** * @brief TiXmlDocument::LoadFile * @param _filename xml檔名 * @param encoding 檔案編碼型別 * @return * * 這個方法只是開啟xml檔案,然後呼叫另一個LoadFile()方法 * */ bool TiXmlDocument::LoadFile( const char* _filename, TiXmlEncoding encoding ) { TIXML_STRING filename( _filename ); value = filename; // reading in binary mode so that tinyxml can normalize the EOL FILE* file = TiXmlFOpen( value.c_str (), "rb" ); if ( file ) { bool result = LoadFile( file, encoding ); fclose( file ); return result; } else { SetError( TIXML_ERROR_OPENING_FILE, 0, 0, TIXML_ENCODING_UNKNOWN ); return false; } }
2.TiXmlDocument::LoadFile() 讀取xml到陣列,統一換行符為\n
/** * @brief TiXmlDocument::LoadFile * @param file * @param encoding * @return * * 1.將檔案內容讀到一個字元陣列中 * 2.換行符統一替換成\n,檔案換行符在不同的系統實現不同,有\n,\r\n,\r三種形式。 * 3.呼叫Parse()方法 */ bool TiXmlDocument::LoadFile( FILE* file, TiXmlEncoding encoding ) { if ( !file ) { SetError( TIXML_ERROR_OPENING_FILE, 0, 0, TIXML_ENCODING_UNKNOWN ); return false; } // Delete the existing data: Clear(); location.Clear(); // Get the file size, so we can pre-allocate the string. HUGE speed impact. long length = 0; fseek( file, 0, SEEK_END ); length = ftell( file ); fseek( file, 0, SEEK_SET ); // Strange case, but good to handle up front. if ( length <= 0 ) { SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); return false; } // Subtle bug here. TinyXml did use fgets. But from the XML spec: // 2.11 End-of-Line Handling // <snip> // <quote> // ...the XML processor MUST behave as if it normalized all line breaks in external // parsed entities (including the document entity) on input, before parsing, by translating // both the two-character sequence #xD #xA and any #xD that is not followed by #xA to // a single #xA character. // </quote> // // It is not clear fgets does that, and certainly isn't clear it works cross platform. // Generally, you expect fgets to translate from the convention of the OS to the c/unix // convention, and not work generally. /* while( fgets( buf, sizeof(buf), file ) ) { data += buf; } */ char* buf = new char[ length+1 ]; buf[0] = 0; if ( fread( buf, length, 1, file ) != 1 ) { delete [] buf; SetError( TIXML_ERROR_OPENING_FILE, 0, 0, TIXML_ENCODING_UNKNOWN ); return false; } // Process the buffer in place to normalize new lines. (See comment above.) // Copies from the 'p' to 'q' pointer, where p can advance faster if // a newline-carriage return is hit. // // Wikipedia: // Systems based on ASCII or a compatible character set use either LF (Line feed, '\n', 0x0A, 10 in decimal) or // CR (Carriage return, '\r', 0x0D, 13 in decimal) individually, or CR followed by LF (CR+LF, 0x0D 0x0A)... // * LF: Multics, Unix and Unix-like systems (GNU/Linux, AIX, Xenix, Mac OS X, FreeBSD, etc.), BeOS, Amiga, RISC OS, and others // * CR+LF: DEC RT-11 and most other early non-Unix, non-IBM OSes, CP/M, MP/M, DOS, OS/2, Microsoft Windows, Symbian OS // * CR: Commodore 8-bit machines, Apple II family, Mac OS up to version 9 and OS-9 const char* p = buf; // the read head char* q = buf; // the write head const char CR = 0x0d; const char LF = 0x0a; buf[length] = 0; while( *p ) { assert( p < (buf+length) ); assert( q <= (buf+length) ); assert( q <= p ); if ( *p == CR ) { *q++ = LF; p++; if ( *p == LF ) { // check for CR+LF (and skip LF) p++; } } else { *q++ = *p++; } } assert( q <= (buf+length) ); *q = 0; Parse( buf, 0, encoding ); //解析xml delete [] buf; return !Error(); }
3.TiXmlDocument::Parse() 解析整個xml文件,生成DOM樹
/**
* @brief TiXmlDocument::Parse
* @param p
* @param prevData
* @param encoding
* @return
*
* 完成DOM的建立
*
*/
const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
{
ClearError();
// Parse away, at the document level. Since a document
// contains nothing but other tags, most of what happens
// here is skipping white space.
if ( !p || !*p )
{
SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
return 0;
}
// Note that, for a document, this needs to come
// before the while space skip, so that parsing
// starts from the pointer we are given.
location.Clear();
if ( prevData )
{
location.row = prevData->cursor.row;
location.col = prevData->cursor.col;
}
else
{
location.row = 0;
location.col = 0;
}
TiXmlParsingData data( p, TabSize(), location.row, location.col );
location = data.Cursor();
if ( encoding == TIXML_ENCODING_UNKNOWN )
{
// Check for the Microsoft UTF-8 lead bytes.
const unsigned char* pU = (const unsigned char*)p;
if ( *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
&& *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
&& *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
{
encoding = TIXML_ENCODING_UTF8;
useMicrosoftBOM = true;
}
}
/*這個方法的功能是判斷的當前的指標p指向的字元
*是不是空白字元(即空格或換行符),如果是,則指標
*前移,找到一個不是空白字元的字元,返回當前的指標位置
*如果不是,還返回這個指標
*
*由此可見這個方法也非常重要,不斷地跳過空白字元,不停地解析資料
*/
p = SkipWhiteSpace( p, encoding );
if ( !p )
{
SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
return 0;
}
/*
*重點在這裡
*解析xml字串,直到結束'\0'
*
*/
while ( p && *p )
{
/*根據頭部判斷當前的指標指向哪種節點,然後new一個
*相應的節點,並返回該節點指標,並且設定該節點的父節點為this
*TiXmlNode是一個基類
*是對xml的元素、註釋、文字、文件宣告的抽象
*/
TiXmlNode* node = Identify( p, encoding );
if ( node )
{
/*下面是多型執行的,不同的節點型別,實現是不同的
*假設node是一個元素節點,那麼這個元素就會有屬性
*就會有子元素等資訊,所以要繼續解析,因此這個node也有子節點,
*直到這個節點,這就是多叉樹形成的原因
*直到這個節點內容結束,返回當前位置指標。
*/
p = node->Parse( p, &data, encoding );
/*
*將這個節點,連線到父節點樹上
*/
LinkEndChild( node );
}
else
{
break;
}
// Did we get encoding info?
if ( encoding == TIXML_ENCODING_UNKNOWN
&& node->ToDeclaration() )
{
TiXmlDeclaration* dec = node->ToDeclaration();
const char* enc = dec->Encoding();
assert( enc );
if ( *enc == 0 )
encoding = TIXML_ENCODING_UTF8;
else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
encoding = TIXML_ENCODING_UTF8;
else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice
else
encoding = TIXML_ENCODING_LEGACY;
}
p = SkipWhiteSpace( p, encoding );
}
// Was this empty?
if ( !firstChild ) {
SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
return 0;
}
// All is well.
return p;
}
4.TiXmlElement::Parse() 解析元素,生成元素多叉樹,還有註釋類,文件宣告類實現的Parse()方法,在此省略不述
/**
* @brief TiXmlElement::Parse
* @param p
* @param data
* @param encoding
* @return
*
*是基類TiXmlNode::Parse()的一種實現,用來解析元素型別的多叉樹
*
*/
const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
{
p = SkipWhiteSpace( p, encoding );
TiXmlDocument* document = GetDocument();
if ( !p || !*p )
{
if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
return 0;
}
if ( data )
{
data->Stamp( p, encoding );
location = data->Cursor();
}
if ( *p != '<' )
{
if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
return 0;
}
p = SkipWhiteSpace( p+1, encoding );
// Read the name.
const char* pErr = p;
//獲取元素名 (value是類成員)
p = ReadName( p, &value, encoding );
if ( !p || !*p )
{
if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
return 0;
}
TIXML_STRING endTag ("</");
//獲取這個元素的結束標記
endTag += value;
// Check for and read attributes. Also look for an empty
// tag or an end tag.
while ( p && *p )
{
pErr = p;
p = SkipWhiteSpace( p, encoding );
if ( !p || !*p )
{
if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
return 0;
}
if ( *p == '/' )
{
++p;
// Empty tag.
if ( *p != '>' )
{
if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
return 0;
}
return (p+1);
}
// 讀取元素的值
else if ( *p == '>' )
{
// Done with attributes (if there were any.)
// Read the value -- which can include other
// elements -- read the end tag, and return.
++p;
//有可能這個元素沒有值,接著又是子元素,如<Person><Boy>Jim</Boy></Person>
p = ReadValue( p, data, encoding ); // Note this is an Element method, and will set the error if one happens.
if ( !p || !*p ) {
// We were looking for the end tag, but found nothing.
// Fix for [ 1663758 ] Failure to report error on bad XML
if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
return 0;
}
// We should find the end tag now
// note that:
// </foo > and
// </foo>
// are both valid end tags.
if ( StringEqual( p, endTag.c_str(), false, encoding ) )
{
p += endTag.length();
p = SkipWhiteSpace( p, encoding );
if ( p && *p && *p == '>' ) {
++p;
return p;
}
if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
return 0;
}
else
{
if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
return 0;
}
}
//讀取元素的屬性
else
{
// Try to read an attribute:
TiXmlAttribute* attrib = new TiXmlAttribute();
if ( !attrib )
{
return 0;
}
attrib->SetDocument( document );
pErr = p;
p = attrib->Parse( p, data, encoding );
if ( !p || !*p )
{
if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
delete attrib;
return 0;
}
// Handle the strange case of double attributes:
#ifdef TIXML_USE_STL
TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() );
#else
TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
#endif
if ( node )
{
if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
delete attrib;
return 0;
}
attributeSet.Add( attrib );
}
}
return p;
}
5.TiXmlElement::ReadValue() 讀取元素的值,解析子元素
/**
* @brief TiXmlElement::ReadValue
* @param p
* @param data
* @param encoding
* @return
* 讀取元素的值和解析子元素
*
*/
const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
{
TiXmlDocument* document = GetDocument();
// Read in text and elements in any order.
const char* pWithWhiteSpace = p;
p = SkipWhiteSpace( p, encoding );
while ( p && *p )
{
if ( *p != '<' )
{
// Take what we have, make a text element.
TiXmlText* textNode = new TiXmlText( "" );
if ( !textNode )
{
return 0;
}
if ( TiXmlBase::IsWhiteSpaceCondensed() )
{
p = textNode->Parse( p, data, encoding );
}
else
{
// Special case: we want to keep the white space
// so that leading spaces aren't removed.
p = textNode->Parse( pWithWhiteSpace, data, encoding );
}
if ( !textNode->Blank() )
LinkEndChild( textNode );
else
delete textNode;
}
//一個子元素標籤的開始,解析子元素
else
{
// We hit a '<'
// Have we hit a new element or an end tag? This could also be
// a TiXmlText in the "CDATA" style.
if ( StringEqual( p, "</", false, encoding ) )
{
return p;
}
else
{
TiXmlNode* node = Identify( p, encoding );
if ( node )
{
p = node->Parse( p, data, encoding );
LinkEndChild( node );
}
else
{
return 0;
}
}
}
pWithWhiteSpace = p;
p = SkipWhiteSpace( p, encoding );
}
if ( !p )
{
if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
}
return p;
}