python 開心網和豆瓣日記爬取的小爬蟲

Posted on 2021-05-28 by WalkonNet

項目地址：

https://github.com/aturret/python-crawler-exercise

用到了 BeautifulSoup4，請先安裝。

pip install beautifulsoup4

開心網日記爬取

kaixin001.py

使用

登錄開心網，瀏覽器F12看http請求的header，獲取自己的cookie。

填寫cookie，要爬的日記的url，要爬的總次數。走你。

之後會生成 HTML 文件，文件名格式是 <title>-<YYYYMMDDHHMMSS>.html

代碼

# -*- coding: utf-8 -*-
from urllib.request import urlopen
import urllib.request
import urllib.parse #為瞭獲取HTTP response
from bs4 import BeautifulSoup #BS4
import string # 為瞭去掉空白字符
import time # 防止被殺cookie
import unicodedata # 字符修正
# Put the first link here.
urlx = '鏈接' # URL of the first diary entry to crawl; request() overwrites it with the next entry's URL

def _kaixin_file_name(title_text, date_text):
    """Build the output file name ``<title>-<timestamp>.html``.

    Args:
        title_text: Plain-text diary title.
        date_text: Plain-text publication date as shown on the page.

    Returns:
        A file name in which characters that cannot appear in file names
        have been neutralised.
    """
    # Characters that are illegal in Windows file names (and '/' on POSIX) are
    # swapped for full-width look-alikes, extending the original
    # ':' -> '：' and '"' -> '“' substitutions. Line breaks and tabs become
    # spaces so the name stays on one line.
    title_table = str.maketrans({
        ':': '：', '"': '“', '/': '／', '\\': '＼', '*': '＊',
        '?': '？', '<': '＜', '>': '＞', '|': '｜',
        '\r': ' ', '\n': ' ', '\t': ' ',
    })
    # Drop every ASCII whitespace and punctuation character from the date.
    date_table = str.maketrans('', '', string.whitespace + string.punctuation)
    stamp = date_text.translate(date_table)
    # Remove the "published" suffix.
    # NOTE(review): the live site presumably renders it in Simplified Chinese
    # ('发表'); both spellings are stripped -- confirm against a real page.
    stamp = stamp.replace('發表', '').replace('发表', '')
    return title_text.translate(title_table).strip() + '-' + stamp + '.html'


def request(url):
    """Download one diary page, save it as an HTML file and queue the next one.

    The page is fetched with the cookie configured in ``headers``; title,
    publication date and body are extracted and written to
    ``<title>-<timestamp>.html`` in the current directory. The global
    ``urlx`` is then replaced by the URL of the "next entry" link so that the
    caller's loop continues with the following diary entry.

    Args:
        url: URL of the diary entry to fetch.

    Raises:
        RuntimeError: If the page lacks the expected title/date/body elements
            or the "next entry" link (for example when the cookie is missing
            or expired and a different page is returned).
        urllib.error.URLError: If the HTTP request fails.
    """
    from html import escape  # local import: only needed for the generated page

    global urlx  # the next URL is handed back to the main loop through this global

    # Send the cookie with urllib to get the HTTP response.
    # The former 'GET https' entry was the request line pasted from the
    # browser's dev tools, not a header, so it is no longer sent.
    headers = {
        'Host': 'www.kaixin001.com',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cookie': ' ',  # replace with your own cookie (browser F12 -> request headers)
    }
    req = urllib.request.Request(url=url, headers=headers)
    with urllib.request.urlopen(req) as response:
        contents = response.read()

    # Parse the whole document with BS4.
    bsObj = BeautifulSoup(contents, "html.parser")

    # Title: <b class="f14">; publication date: <span class="c6">;
    # body: <div class="textCont">.
    title = bsObj.find("b", attrs={"class": "f14"})
    date = bsObj.find("span", attrs={"class": "c6"})
    text = bsObj.find("div", attrs={"class": "textCont"})
    if title is None or date is None or text is None:
        raise RuntimeError(
            "Diary title/date/body not found at %s (is the cookie still valid?)" % url
        )
    titleT = title.get_text()
    dateT = date.get_text()

    # Progress output.
    print(title)
    print(dateT)

    fileTitle = _kaixin_file_name(titleT, dateT)
    print(fileTitle)  # progress output

    # Title and date are plain text and must be escaped; the body is already
    # HTML and is embedded as-is. The charset is declared because the file is
    # written as UTF-8.
    message = """
    <html>
    <head><meta charset="utf-8"></head>
    <body>
    <h1>%s</h1>
    <b>%s</b>
    <br></br>
    %s
    </body>
    </html>""" % (escape(titleT), escape(dateT), unicodedata.normalize('NFD', text.prettify()))

    # Write as UTF-8 regardless of the encoding the page was served in.
    with open(fileTitle, 'w', encoding="utf-8") as f:
        f.write(message)

    # Locate the URL of the next entry: an <a> whose text is "下一篇 >".
    next_link = bsObj.find("a", text="下一篇 >")
    if next_link is None or not next_link.get("href"):
        raise RuntimeError("No next-entry link found at %s" % url)
    # urljoin yields the same result as plain concatenation for site-absolute
    # hrefs and additionally copes with absolute or slash-less hrefs.
    urlx = urllib.parse.urljoin("http://www.kaixin001.com", next_link["href"])
    print(urlx)


# Main loop: fetch a fixed number of diary entries, one per iteration.
num = 328  # how many entries to crawl; duplicates are not detected, so set this to the number of entries
for done in range(1, num + 1):
    request(urlx)
    print('We get ', done, ' in ', num, sep='')
    time.sleep(1)  # throttle: crawling without a pause got the cookie invalidated during testing

豆瓣日記爬取

douban.py

使用

登錄豆瓣，瀏覽器F12看http請求的header，獲取自己的cookie。

填寫變量COOKIE，要爬的日記頁的url。走你。

之後會生成 HTML 文件，文件名格式是 <title>-<YYYYMMDDHHMMSS>.html

代碼

# -*- coding: utf-8 -*-
from urllib.request import urlopen
import urllib.request
import urllib.parse #為瞭獲取HTTP response
from bs4 import BeautifulSoup #BS4
import string # 為瞭去掉空白字符
import unicodedata # 字符修正
import re
# Put the link here.
url = '' # notes listing page of the user to crawl, e.g. https://www.douban.com/people/xxx/notes
COOKIE = '' # sent as the Cookie request header by request() and requestSinglePage()

def request(urlx):
    """Crawl one page of a notes listing and save every note linked from it.

    Every ``<a class="j a_unfolder_n">`` inside an ``<h3>`` of the
    ``<div class="article">`` container is passed to ``requestSinglePage``.
    Afterwards the global ``url`` is set to the next listing page, or the
    global ``boolean`` is set to 1 when there is no next-page link, which
    ends the caller's loop.

    Args:
        urlx: URL of the listing page to fetch.

    Raises:
        RuntimeError: If the listing container is missing (for example when
            the cookie is missing or expired and a different page is returned).
        urllib.error.URLError: If the HTTP request fails.
    """
    global url      # next listing page, consumed by the main loop
    global boolean  # set to 1 to tell the main loop to stop

    # Send the cookie with urllib to get the HTTP response.
    # The former 'GET https' entry was the request line pasted from the
    # browser's dev tools, not a header, so it is no longer sent.
    headers = {
        'Host': 'www.douban.com',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cookie': COOKIE,  # your own cookie (browser F12 -> request headers)
    }
    req = urllib.request.Request(url=urlx, headers=headers)
    with urllib.request.urlopen(req) as response:
        contents = response.read()

    # Parse the whole document with BS4.
    bsObj = BeautifulSoup(contents, "html.parser")

    # Collect all note links on the current listing page.
    article = bsObj.find("div", attrs={"class": "article"})
    if article is None:
        raise RuntimeError(
            "Notes listing not found at %s (is the cookie still valid?)" % urlx
        )
    for title in article.find_all("h3"):
        for link in title.find_all("a", attrs={"class": "j a_unfolder_n"}):
            noteUrl = str(link.attrs["href"])
            print(noteUrl)
            requestSinglePage(noteUrl)

    # Next-page link.
    # NOTE(review): the live site presumably labels it in Simplified Chinese
    # ('后页>'); both spellings are accepted -- confirm against a real page.
    next_link = bsObj.find("a", text=["後頁>", "后页>"])
    if next_link is None:
        print("結束瞭")
        boolean = 1
    else:
        next_href = str(next_link.attrs["href"]).replace("&type=note", "")
        # Absolute hrefs pass through unchanged; relative ones are resolved
        # against the page that was just fetched.
        url = urllib.parse.urljoin(urlx, next_href)
        print(url)

def _douban_file_name(title_text, date_text):
    """Build the output file name ``<title>-<timestamp>.html``.

    Args:
        title_text: Plain-text note title.
        date_text: Plain-text publication date as shown on the page.

    Returns:
        A file name in which characters that cannot appear in file names
        have been neutralised.
    """
    # Characters that are illegal in Windows file names (and '/' on POSIX) are
    # swapped for full-width look-alikes; line breaks and tabs become spaces.
    title_table = str.maketrans({
        ':': '：', '"': '“', '/': '／', '\\': '＼', '*': '＊',
        '?': '？', '<': '＜', '>': '＞', '|': '｜',
        '\r': ' ', '\n': ' ', '\t': ' ',
    })
    # Drop every ASCII whitespace and punctuation character from the date.
    date_table = str.maketrans('', '', string.whitespace + string.punctuation)
    return (
        title_text.translate(title_table).strip()
        + '-'
        + date_text.translate(date_table)
        + '.html'
    )


def requestSinglePage(urly):
    """Download a single note and save it as an HTML file.

    Title (``<h1>``), publication date (``<span class="pub-date">``) and body
    (``<div id="link-report">``) are extracted and written to
    ``<title>-<timestamp>.html`` in the current directory.

    Args:
        urly: URL of the note to fetch.

    Raises:
        RuntimeError: If the page lacks the expected title/date/body elements
            (for example when the note is not visible with the given cookie).
        urllib.error.URLError: If the HTTP request fails.
    """
    from html import escape  # local import: only needed for the generated page

    # The former 'GET https' entry was the request line pasted from the
    # browser's dev tools, not a header, so it is no longer sent.
    headers = {
        'Host': 'www.douban.com',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cookie': COOKIE,  # your own cookie (browser F12 -> request headers)
    }
    req = urllib.request.Request(url=urly, headers=headers)
    with urllib.request.urlopen(req) as response:
        contents = response.read()

    # Parse the whole document with BS4.
    bsObj = BeautifulSoup(contents, "html.parser")

    # Extract title, publication date and body.
    title_tag = bsObj.find("h1")
    date_tag = bsObj.find("span", attrs={"class": "pub-date"})
    text = bsObj.find("div", attrs={"id": "link-report"})
    if title_tag is None or date_tag is None or text is None:
        raise RuntimeError(
            "Note title/date/body not found at %s (is the cookie still valid?)" % urly
        )
    title = title_tag.get_text()
    dateT = date_tag.get_text()

    # Progress output.
    print(title)
    print(dateT)

    fileTitle = _douban_file_name(title, dateT)
    print(fileTitle)  # progress output

    # Title and date are plain text and must be escaped; the body is already
    # HTML and is embedded as-is. The charset is declared because the file is
    # written as UTF-8.
    message = """
    <html>
    <head><meta charset="utf-8"></head>
    <body>
    <h1>%s</h1>
    <b>%s</b>
    <br></br>
    %s
    </body>
    </html>""" % (escape(title), escape(dateT), unicodedata.normalize('NFD', text.prettify()))

    # Write as UTF-8 regardless of the encoding the page was served in.
    with open(fileTitle, 'w', encoding="utf-8") as f:
        f.write(message)

# Main loop: crawl listing pages until request() reports that there is no next page.

boolean = 0  # request() sets this to 1 once the last listing page has been processed
a = 1  # 1-based number of the listing page being processed; must live outside the loop or it resets every pass
while boolean == 0:
    request(url)
    print('We finished page '+str(a)+' .')
    a += 1

Roadmap

豆瓣四月份的時候還有 bug：手機端可以看到全部日記，「半年隱藏」無效。最近修好了。

不過現在的隱藏依然沒有針對到具體的日記，或許可以想辦法通過其他手段爬下來。

以上就是 python 開心網和豆瓣日記爬取的示例步驟的詳細內容，更多關於 python 日記爬取的資料請關注 WalkonNet 其它相關文章！

python 開心網和豆瓣日記爬取的小爬蟲

項目地址：

開心網日記爬取

使用

代碼

豆瓣日記爬取

使用

代碼

Roadmap

推薦閱讀：

發佈留言取消回覆

近期文章

項目地址：

開心網日記爬取

使用

代碼

豆瓣日記爬取

使用

代碼

Roadmap

推薦閱讀：

發佈留言 取消回覆

近期文章

標籤

發佈留言取消回覆