Python xpath,JsonPath,bs4的基本使用

Posted on 2022-07-03 by WalkonNet

1.xpath

1.1 xpath使用

google提前安裝xpath插件，按ctrl + shift + x 出現小黑框
安裝lxml庫：pip install lxml -i https://pypi.douban.com/simple
導入lxml.etree：from lxml import etree
etree.parse() 解析本地文件：html_tree = etree.parse('XX.html')
etree.HTML() 解析服務器響應文件：html_tree = etree.HTML(response.read().decode('utf-8'))
調用xpath：html_tree.xpath(xpath路徑)

1.2 xpath基本語法

1.路徑查詢

//：查找所有子孫節點，不考慮層級關系
/：找直接子節點

2.謂詞查詢

//div[@id] 
//div[@id="maincontent"]

3.屬性查詢

//@class

4.模糊查詢

//div[contains(@id, "he")] 
//div[starts-with(@id, "he")]

5.內容查詢

//div/h1/text()

6.邏輯運算

//div[@id="head" and @class="s_down"] 
//title | //price

1.3 示例

xpath.html

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8"/>
    <title>Title</title>
</head>
<body>
    <ul>
        <li id="l1" class="class1">北京</li>
        <li id="l2" class="class2">上海</li>
        <li id="d1">廣州</li>
        <li>深圳</li>
    </ul>
</body>
</html>

from lxml import etree

# XPath parsing entry points:
#   local file                                        -> etree.parse()
#   server response, response.read().decode('utf-8')  -> etree.HTML()


doc = etree.parse('xpath.html')

# Every <li> under body/ul
li_nodes = doc.xpath('//body/ul/li')
print(len(li_nodes))  # 4

# Text content of those <li> elements
li_texts = doc.xpath('//body/ul/li/text()')
print(li_texts)  # ['北京', '上海', '廣州', '深圳']

# <li> elements that carry an id attribute
li_with_id = doc.xpath('//ul/li[@id]')
print(len(li_with_id))  # 3

# Text of the <li> whose id is "l1"
l1_text = doc.xpath('//ul/li[@id="l1"]/text()')
print(l1_text)  # ['北京']

# Value of the class attribute on the <li> whose id is "l1"
l1_class = doc.xpath('//ul/li[@id="l1"]/@class')
print(l1_class)  # ['class1']

# Text of every <li> whose id contains the letter "l"
contains_l = doc.xpath('//ul/li[contains(@id, "l")]/text()')
print(contains_l)  # ['北京', '上海']
# Text of every <li> whose id starts with "d"
starts_d = doc.xpath('//ul/li[starts-with(@id,"d")]/text()')
print(starts_d)  # ['廣州']
# Text of the <li> with id "l2" AND class "class2"
l2_and_class2 = doc.xpath('//ul/li[@id="l2" and @class="class2"]/text()')
print(l2_and_class2)  # ['上海']
# Text of the <li> with id "l2" OR id "d1" (union of two paths)
l2_or_d1 = doc.xpath('//ul/li[@id="l2"]/text() | //ul/li[@id="d1"]/text()')
print(l2_or_d1)  # ['上海', '廣州']

1.4 爬取百度搜索按鈕的value

import urllib.request
from lxml import etree
# Fetch the Baidu home page and read the `value` attribute of the search
# button (<input id="su">).
url = 'http://www.baidu.com'
# A browser-like User-Agent is sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
# The context manager closes the HTTP response as soon as the body is read,
# instead of leaving the connection open until garbage collection.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')
# Server responses are parsed with etree.HTML() (local files use etree.parse()).
tree = etree.HTML(content)
# xpath() returns a list, even when a single attribute matches.
value = tree.xpath('//input[@id="su"]/@value')
print(value)

1.5 爬取站長素材的圖片

# 需求 下載的前十頁的圖片
# https://sc.chinaz.com/tupian/qinglvtupian.html   1
# https://sc.chinaz.com/tupian/qinglvtupian_page.html
import urllib.request
from lxml import etree
def create_request(page):
    """Build the urllib Request for one listing page.

    Page 1 lives at ``qinglvtupian.html``; every other page number is
    appended as ``qinglvtupian_<page>.html``.

    :param page: page number of the listing to fetch
    :return: a ``urllib.request.Request`` carrying a browser User-Agent
    """
    base = 'https://sc.chinaz.com/tupian/qinglvtupian'
    # Only the first page has no numeric suffix.
    suffix = '' if page == 1 else '_' + str(page)
    target = base + suffix + '.html'
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
    }
    return urllib.request.Request(url=target, headers=browser_headers)
def get_content(request):
    """Fetch *request* and return the response body decoded as UTF-8.

    :param request: anything ``urllib.request.urlopen`` accepts (the caller
        in this script passes a ``urllib.request.Request``)
    :return: the page source as ``str``
    :raises urllib.error.URLError: if the request cannot be completed
    :raises UnicodeDecodeError: if the body is not valid UTF-8
    """
    # The context manager closes the response even when read()/decode() fails.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
def down_load(content):
    """Download every image found in one listing page into ``./loveImg/``.

    Each image is saved as ``<alt text>.jpg``.

    :param content: HTML source of a listing page
    """
    import os  # local import: only this function needs it

    tree = etree.HTML(content)
    name_list = tree.xpath('//div[@id="container"]//a/img/@alt')
    # Image sites usually lazy-load, so the address is read from the `src2`
    # attribute rather than `src`.
    src_list = tree.xpath('//div[@id="container"]//a/img/@src2')
    print(src_list)

    # urlretrieve does not create missing directories.
    os.makedirs('./loveImg', exist_ok=True)

    # zip() pairs each name with its address and stops at the shorter list,
    # so a page with mismatched alt/src2 counts no longer raises IndexError.
    for name, src in zip(name_list, src_list):
        # presumably src2 is protocol-relative ("//host/path"), hence the
        # scheme prefix -- verify against the live page
        url = 'https:' + src
        urllib.request.urlretrieve(url=url, filename='./loveImg/' + name + '.jpg')
if __name__ == '__main__':
    # Ask for the inclusive range of listing pages to crawl.
    start_page = int(input('請輸入起始頁碼'))
    end_page = int(input('請輸入結束頁碼'))

    for page in range(start_page, end_page + 1):
        # Pipeline per page: build the request -> fetch the page source
        # -> download the images it references.
        down_load(get_content(create_request(page)))

2. JsonPath

2.1 pip安裝

pip install jsonpath

2.2 jsonpath的使用

obj = json.load(open('json文件', 'r', encoding='utf-8'))
ret = jsonpath.jsonpath(obj, 'jsonpath語法')

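JSONPath語法元素和對應XPath元素的對比（XPath → JSONPath：說明）：

/ → $ ：根節點
. → @ ：當前節點
/ → . 或 [] ：取子節點
.. → 無 ：取父節點（JSONPath不支持）
// → .. ：遞歸下降，不考慮層級
* → * ：通配符，匹配所有元素
@ → 無 ：屬性訪問（JSON結構沒有屬性）
[] → [] ：下標運算符
| → [,] ：並集
無 → [start:end:step] ：數組切片
[] → ?() ：過濾表達式
無 → () ：腳本表達式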

示例：

jsonpath.json

{ "store": {
    "book": [
      { "category": "修真",
        "author": "六道",
        "title": "壞蛋是怎樣練成的",
        "price": 8.95
      },
      { "category": "修真",
        "author": "天蠶土豆",
        "title": "鬥破蒼穹",
        "price": 12.99
      },
      { "category": "修真",
        "author": "唐傢三少",
        "title": "鬥羅大陸",
        "isbn": "0-553-21311-3",
        "price": 8.99
      },
      { "category": "修真",
        "author": "南派三叔",
        "title": "星辰變",
        "isbn": "0-395-19395-8",
        "price": 22.99
      }
    ],
    "bicycle": {
      "author": "老馬",
      "color": "黑色",
      "price": 19.95
    }
  }
}

import json
import jsonpath

# Load the sample document. The context manager closes the file handle,
# which the one-liner json.load(open(...)) would leave open.
with open('jsonpath.json', 'r', encoding='utf-8') as fp:
    obj = json.load(fp)

# Authors of every book in the store
author_list = jsonpath.jsonpath(obj, '$.store.book[*].author')
print(author_list)  # ['六道', '天蠶土豆', '唐傢三少', '南派三叔']

# Every author anywhere in the document (recursive descent)
author_list = jsonpath.jsonpath(obj, '$..author')
print(author_list)  # ['六道', '天蠶土豆', '唐傢三少', '南派三叔', '老馬']

# Every element directly under store
tag_list = jsonpath.jsonpath(obj, '$.store.*')
print(tag_list)
# [[{'category': '修真', 'author': '六道', 'title': '壞蛋是怎樣練成的', 'price': 8.95}, {'category': '修真', 'author': '天蠶土豆', 'title': '鬥破蒼穹', 'price': 12.99}, {'category': '修真', 'author': '唐傢三少', 'title': '鬥羅大陸', 'isbn': '0-553-21311-3', 'price': 8.99}, {'category': '修真', 'author': '南派三叔', 'title': '星辰變', 'isbn': '0-395-19395-8', 'price': 22.99}], {'author': '老馬', 'color': '黑色', 'price': 19.95}]

# The price of everything inside store
price_list = jsonpath.jsonpath(obj, '$.store..price')
print(price_list)  # [8.95, 12.99, 8.99, 22.99, 19.95]

# The third book (indexes start at 0)
book = jsonpath.jsonpath(obj, '$..book[2]')
print(book)  # [{'category': '修真', 'author': '唐傢三少', 'title': '鬥羅大陸', 'isbn': '0-553-21311-3', 'price': 8.99}]

# The last book
book = jsonpath.jsonpath(obj, '$..book[(@.length-1)]')
print(book)  # [{'category': '修真', 'author': '南派三叔', 'title': '星辰變', 'isbn': '0-395-19395-8', 'price': 22.99}]

# The first two books
book_list = jsonpath.jsonpath(obj, '$..book[0,1]')
# Slice form given by the original as an alternative: '$..book[:2]'
print(book_list)
# [{'category': '修真', 'author': '六道', 'title': '壞蛋是怎樣練成的', 'price': 8.95}, {'category': '修真', 'author': '天蠶土豆', 'title': '鬥破蒼穹', 'price': 12.99}]

# Filters: put a "?" in front of the parenthesised condition.
# All books that have an isbn
book_list = jsonpath.jsonpath(obj, '$..book[?(@.isbn)]')
print(book_list)
# [{'category': '修真', 'author': '唐傢三少', 'title': '鬥羅大陸', 'isbn': '0-553-21311-3', 'price': 8.99}, {'category': '修真', 'author': '南派三叔', 'title': '星辰變', 'isbn': '0-395-19395-8', 'price': 22.99}]

# All books priced above 10
book_list = jsonpath.jsonpath(obj, '$..book[?(@.price>10)]')
print(book_list)
# [{'category': '修真', 'author': '天蠶土豆', 'title': '鬥破蒼穹', 'price': 12.99}, {'category': '修真', 'author': '南派三叔', 'title': '星辰變', 'isbn': '0-395-19395-8', 'price': 22.99}]

3. BeautifulSoup

3.1 基本簡介

1.安裝

pip install bs4

2.導入

from bs4 import BeautifulSoup

3.創建對象

服務器響應的文件生成對象 soup = BeautifulSoup(response.read().decode(), 'lxml')
本地文件生成對象 soup = BeautifulSoup(open('1.html', encoding='utf-8'), 'lxml')

註意：默認打開文件的編碼格式gbk所以需要指定打開編碼格式utf-8

3.2 安裝以及創建

1.根據標簽名查找節點 
	soup.a 【註】隻能找到第一個a 
		soup.a.name 
		soup.a.attrs 
2.函數 
	(1).find(返回一個對象) 
		find('a')：隻找到第一個a標簽
		find('a', title='名字') 
		find('a', class_='名字') 
	(2).find_all(返回一個列表) 
		find_all('a') 查找到所有的a 
		find_all(['a', 'span']) 返回所有的a和span 
		find_all('a', limit=2) 隻找前兩個a 
	(3).select(根據選擇器得到節點對象)【推薦】 
		1.element 
			eg:p 
		2..class 
			eg:.firstname 
		3.#id
			eg:#firstname 
		4.屬性選擇器 
			[attribute] 
				eg:li = soup.select('li[class]') 
			[attribute=value] 
				eg:li = soup.select('li[class="hengheng1"]') 
		5.層級選擇器 
			element element 
				div p 
			element>element 
				div>p 
			element,element 
				div,p 
					eg:soup = soup.select('a,span')

3.3 節點定位

1.根據標簽名查找節點 
	soup.a 【註】隻能找到第一個a 
		soup.a.name 
		soup.a.attrs 
2.函數 
	(1).find(返回一個對象) 
		find('a')：隻找到第一個a標簽
		find('a', title='名字') 
		find('a', class_='名字') 
	(2).find_all(返回一個列表) 
		find_all('a') 查找到所有的a 
		find_all(['a', 'span']) 返回所有的a和span 
		find_all('a', limit=2) 隻找前兩個a 
	(3).select(根據選擇器得到節點對象)【推薦】 
		1.element 
			eg:p 
		2..class 
			eg:.firstname 
		3.#id
			eg:#firstname 
		4.屬性選擇器 
			[attribute] 
				eg:li = soup.select('li[class]') 
			[attribute=value] 
				eg:li = soup.select('li[class="hengheng1"]') 
		5.層級選擇器 
			element element 
				div p 
			element>element 
				div>p 
			element,element 
				div,p 
					eg:soup = soup.select('a,span')

3.5 節點信息

(1).獲取節點內容：適用於標簽中嵌套標簽的結構 
	obj.string 
	obj.get_text()【推薦】 
(2).節點的屬性 
	tag.name 獲取標簽名 
		eg:tag = find('li')
			print(tag.name) 
	tag.attrs將屬性值作為一個字典返回 
(3).獲取節點屬性 
	obj.attrs.get('title')【常用】 
	obj.get('title') 
	obj['title']

(1).獲取節點內容：適用於標簽中嵌套標簽的結構 
	obj.string 
	obj.get_text()【推薦】 
(2).節點的屬性 
	tag.name 獲取標簽名 
		eg:tag = find('li')
			print(tag.name) 
	tag.attrs將屬性值作為一個字典返回 
(3).獲取節點屬性 
	obj.attrs.get('title')【常用】 
	obj.get('title') 
	obj['title']

3.6 使用示例

bs4.html

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>

    <div>
        <ul>
            <li id="l1">張三</li>
            <li id="l2">李四</li>
            <li>王五</li>
            <a href="" id="" class="a1">google</a>
            <span>嘿嘿嘿</span>
        </ul>
    </div>


    <a href="" title="a2">百度</a>

    <div id="d1">
        <span>
            哈哈哈
        </span>
    </div>

    <p id="p1" class="p1">呵呵呵</p>
</body>
</html>

from bs4 import BeautifulSoup
# Walk through the basic bs4 API by parsing a local file.
# The expected outputs below assume bs4.html contains the two links
#   <a href="" id="" class="a1">google</a>   and   <a href="" title="a2">百度</a>
#
# The encoding is passed explicitly because open() otherwise uses the
# platform default (gbk in the original author's environment). The context
# manager closes the file handle once the document has been parsed.
with open('bs4.html', encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, 'lxml')

# Lookup by tag name: yields only the FIRST matching tag
print(soup.a)  # <a class="a1" href="" id="">google</a>
# Attributes of that tag, as a dict
print(soup.a.attrs)  # {'href': '', 'id': '', 'class': ['a1']}

# --- bs4 functions ---
# (1) find: returns the first match
print(soup.find('a'))  # <a class="a1" href="" id="">google</a>
# Match on the value of the title attribute
print(soup.find('a', title="a2"))  # <a href="" title="a2">百度</a>

# Match on the class value; note the trailing underscore in `class_`
print(soup.find('a', class_="a1"))  # <a class="a1" href="" id="">google</a>

# (2) find_all: returns a list containing every <a> tag
print(soup.find_all('a'))  # [<a class="a1" href="" id="">google</a>, <a href="" title="a2">百度</a>]

# To collect several tag names at once, pass them as a list
# (whitespace inside the last <span> is abbreviated here)
print(soup.find_all(['a','span']))  # [<a class="a1" href="" id="">google</a>, <span>嘿嘿嘿</span>, <a href="" title="a2">百度</a>, <span> 哈哈哈 </span>]

# limit caps the number of results
print(soup.find_all('li', limit=2))  # [<li id="l1">張三</li>, <li id="l2">李四</li>]

# (3) select (recommended): takes a CSS selector and returns a list
print(soup.select('a'))  # [<a class="a1" href="" id="">google</a>, <a href="" title="a2">百度</a>]

# "." selects by class (class selector)
print(soup.select('.a1'))  # [<a class="a1" href="" id="">google</a>]

# "#" selects by id
print(soup.select('#l1'))  # [<li id="l1">張三</li>]

# Attribute selectors: find tags through their attributes
# <li> tags that have an id attribute
print(soup.select('li[id]'))  # [<li id="l1">張三</li>, <li id="l2">李四</li>]

# <li> tags whose id is "l2"
print(soup.select('li[id="l2"]'))  # [<li id="l2">李四</li>]

# Hierarchy selectors
# Descendant selector: every <li> anywhere below a <div>
print(soup.select('div li'))  # [<li id="l1">張三</li>, <li id="l2">李四</li>, <li>王五</li>]

# Child selector: only first-level children
# (the original note says bs4 still returns results when the spaces around
# ">" are omitted, unlike many other tools)
print(soup.select('div > ul > li'))  # [<li id="l1">張三</li>, <li id="l2">李四</li>, <li>王五</li>]

# Selector list: all <a> tags and all <li> tags
print(soup.select('a,li'))
# [<li id="l1">張三</li>, <li id="l2">李四</li>, <li>王五</li>, <a class="a1" href="" id="">google</a>, <a href="" title="a2">百度</a>]

# --- Node information ---
# Node content
obj = soup.select('#d1')[0]
# If a tag holds only text, both .string and get_text() work.
# If it also holds child tags, .string yields None while get_text() still
# returns the text, so get_text() is the recommended choice.
print(obj.string)  # None
print(obj.get_text())  # 哈哈哈

# Node properties
obj = soup.select('#p1')[0]
# .name is the tag name
print(obj.name)  # p
# .attrs returns the attributes as a dict
print(obj.attrs)  # {'id': 'p1', 'class': ['p1']}

# Three equivalent ways to read one attribute
obj = soup.select('#p1')[0]
print(obj.attrs.get('class'))  # ['p1']
print(obj.get('class'))  # ['p1']
print(obj['class'])  # ['p1']

3.7 解析星巴克產品名稱

import urllib.request

from bs4 import BeautifulSoup

# Print the product names listed on the Starbucks China menu page.
url = 'https://www.starbucks.com.cn/menu/'
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as response:
    content = response.read().decode('utf-8')

soup = BeautifulSoup(content, 'lxml')
# Equivalent XPath: //ul[@class="grid padded-3 product"]//strong/text()
# A handy workflow is to prototype the expression with the browser XPath
# plugin first, then translate it into a CSS selector.
name_list = soup.select('ul[class="grid padded-3 product"] strong')
for name in name_list:
    print(name.get_text())

到此這篇關於Python xpath、JsonPath、bs4的基本使用的文章就介紹到這了。更多相關Python xpath、JsonPath、bs4內容，請搜索WalkonNet以前的文章或繼續瀏覽下面的相關文章，希望大家以後多多支持WalkonNet！

Python xpath,JsonPath,bs4的基本使用

目錄

1.xpath

1.1 xpath使用

1.2 xpath基本語法

1.3 示例

1.4 爬取百度搜索按鈕的value

2. JsonPath

2.1 pip安裝

2.2 jsonpath的使用

3. BeautifulSoup

3.1 基本簡介

3.2 安裝以及創建

3.3 節點定位

3.5 節點信息

3.6 使用示例

3.7 解析星巴克產品名稱

推薦閱讀：

發佈留言取消回覆

近期文章

目錄

1.xpath

1.1 xpath使用

1.2 xpath基本語法

1.3 示例

1.4 爬取百度搜索按鈕的value

2. JsonPath

2.1 pip安裝

2.2 jsonpath的使用

3. BeautifulSoup

3.1 基本簡介

3.2 安裝以及創建

3.3 節點定位

3.5 節點信息

3.6 使用示例

3.7 解析星巴克產品名稱

推薦閱讀：

發佈留言 取消回覆

近期文章

標籤

發佈留言取消回覆