1. 程式人生 > >python—BeautifulSoup

python—BeautifulSoup

BeautifulSoup

pip install beautifulsoup4
pip install lxml
解析器 使用方法 優勢 劣勢
Python標準庫 BeautifulSoup(markup, “html.parser”) Python的內建標準庫、執行速度適中 、文件容錯能力強 Python 2.7.3 or 3.2.2)前的版本中文容錯能力差
lxml HTML 解析器 BeautifulSoup(markup, “lxml”) 速度快、文件容錯能力強 需要安裝C語言庫
lxml XML 解析器 BeautifulSoup(markup, “xml”) 速度快、唯一支援XML的解析器 需要安裝C語言庫
html5lib BeautifulSoup(markup, “html5lib”) 最好的容錯性、以瀏覽器的方式解析文件、生成HTML5格式的文件 速度慢、不依賴外部擴充套件

基本的使用方法

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup soup = BeautifulSoup(html, 'lxml') print(soup.prettify()) # 將htlm 返回成標準的格式---縮排等 print(soup.title.string)

標籤選擇器

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup soup = BeautifulSoup(html, 'lxml') # 生成一個物件 print(soup.title) # <title>The Dormouse's story</title> print(type(soup.title)) # <class 'bs4.element.Tag'> print(soup.head) # <head><title>The Dormouse's story</title></head> print(soup.p) # <p class="title" name="dromouse"><b>The Dormouse's story</b></p> # 只輸出第一個p標籤

獲取名稱

print(soup.title.name)
# title

# <title>The Dormouse's story</title>

獲取屬性

print(soup.p.attrs['name'])
print(soup.p['name'])
"""
dromouse
dromouse

<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
"""

獲取內容

print(soup.p.string)

"""
The Dormouse's story
<p clss="title" name="dromouse"><b>The Dormouse's story</b></p>
"""

可以巢狀選擇

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.head.title.string)

"""
The Dormouse's story

<head><title>The Dormouse's story</title></head>
"""

子節點和子孫節點

html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.p.contents) # 選擇子節點,返回一個子節點列表,只要是同級別。不管是標籤還是文字
html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.p.children)
for i, child in enumerate(soup.p.children):
    print(i, child)
"""
.children 返回的是一個迭代器,需要迴圈遍歷使用
"""


# 獲取自孫節點,也是返回一個迭代器
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.p.descendants)
for i, child in enumerate(soup.p.descendants):
    print(i, child)

父節點和祖先節點

html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""
# 父節點
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.a.parent)


# 祖先節點
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(list(enumerate(soup.a.parents)))

兄弟節點

html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(list(enumerate(soup.a.next_siblings))) # 後面的兄弟節點
print(list(enumerate(soup.a.previous_siblings))) # 前面的兄弟節點

標準選擇器

"""
find_all( name , attrs , recursive , text , **kwargs )
可根據標籤名、屬性、內容查詢文件
"""

name

# name 標籤名 返回一個列表
html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all('ul'))
print(type(soup.find_all('ul')[0]))
"""
[<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>, <ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>]
<class 'bs4.element.Tag'>
"""

# 層層迭代的查詢
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
for ul in soup.find_all('ul'):
    print(ul.find_all('li'))

attrs

html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1" name="elements">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(attrs={'id': 'list-1'}))
print(soup.find_all(attrs={'name': 'elements'}))
"""
根絕屬性名和值 進行查詢

[<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>]
[<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>]
"""


from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(id='list-1'))
print(soup.find_all(class_='element')) #class_  因為class是python關鍵字

text

"""
根絕文字內容進行選擇

返回的也是內容
"""

html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(text='Foo'))
"""
['Foo', 'Foo']
"""

find 和 find_all

find( name , attrs , recursive , text , **kwargs )
find返回單個元素,find_all返回所有元素

其他

"""
find_parents() find_parent()
find_parents()返回所有祖先節點,find_parent()返回直接父節點

find_next_siblings() find_next_sibling()
find_next_siblings()返回後面所有兄弟節點,find_next_sibling()返回後面第一個兄弟節點。

find_previous_siblings() find_previous_sibling()
find_previous_siblings()返回前面所有兄弟節點,find_previous_sibling()返回前面第一個兄弟節點。

find_all_next() find_next()
find_all_next()返回節點後所有符合條件的節點, find_next()返回第一個符合條件的節點

find_all_previous() 和 find_previous()
find_all_previous()返回節點後所有符合條件的節點, find_previous()返回第一個符合條件的節點
"""

CSS選擇器

使用

# 通過select()直接傳入CSS選擇器即可完成選擇

html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.select('.panel .panel-heading')) # class的
print(soup.select('ul li')) #直接選擇標籤 ul裡的所有li標籤
print(soup.select('#list-2 .element')) #id=list-1 裡面的class=element
print(type(soup.select('ul')[0])) # <class 'bs4.element.Tag'>


"""

[<div class="panel-heading">
<h4>Hello</h4>
</div>]
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>]
<class 'bs4.element.Tag'>

"""

獲取屬性

html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul['id'])
    print(ul.attrs['id'])
"""
list-1
list-1
list-2
list-2
"""

獲取內容

from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
for li in soup.select('li'):
    print(li.get_text())
"""
Foo
Bar
Jay
Foo
Bar

"""

總結

"""
推薦使用lxml解析庫,必要時使用html.parser
標籤選擇篩選功能弱但是速度快
建議使用find()、find_all() 查詢匹配單個結果或者多個結果
如果對CSS選擇器熟悉建議使用select()
記住常用的獲取屬性和文字值的方法

"""