Python Web Scraping in Detail: Fetching, Parsing, and Storing Data
1. Fetching Data
```python
import requests

def drg(url):
    try:
        head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/91.0.4472.164 Safari/537.36'}
        r = requests.get(url, headers=head)
        r.raise_for_status()  # raise HTTPError if the status code is not 200
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return "An exception occurred"

url = "https://www.ip138.com/mobile.asp?mobile=13018305773&action=mobile"
print(drg(url))
```
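Two lines here are easy to gloss over: `r.raise_for_status()` turns HTTP error codes into exceptions, and `r.encoding = r.apparent_encoding` fixes the garbled text you often get from Chinese sites that omit a charset in their response headers. A minimal sketch of the difference (the printed values depend on the server, and ip138 may require the User-Agent header above):

```python
import requests

r = requests.get("https://www.ip138.com", timeout=10)
print(r.encoding)           # encoding taken from the HTTP headers (often ISO-8859-1)
print(r.apparent_encoding)  # encoding detected from the body bytes (e.g. utf-8)
r.encoding = r.apparent_encoding  # decode r.text with the detected encoding
print(r.text[:200])         # now readable instead of mojibake
```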
2. Parsing Data
```python
import requests

def login():
    try:
        # the login request URL (credentials are passed in the query string)
        urllogin = ("http://www.cqooc.com/user/login?username=12608199000635"
                    "&password=48C032612C2A6777D28A969307B52127E198D59AA78522943C1B283CF7B89E69"
                    "&nonce=6BA36BBB1F623279&cnonce=8257070573EFE28F")
        s = requests.Session()
        r = s.post(urllogin, data=Form, headers=headers)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return s  # return the session so later requests carry the login cookies
    except Exception as error:
        print(error)

def get_html(s, url):
    try:
        r = s.get(url, headers=headers)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return r.text
    except Exception as error:
        print(error)

if __name__ == "__main__":
    # User-Agent sent with every request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36",
    }
    # change these values to your own credentials
    Form = {
        "username": "12608199000635",
        "password": "48C032612C2A6777D28A969307B52127E198D59AA78522943C1B283CF7B89E69",
        "nonce": "6BA36BBB1F623279",
        "cnonce": "8257070573EFE28F"
    }
    lin = login()
    # URL of the personal center page
    url = "http://www.cqooc.com/my/learn"
    html = get_html(lin, url)
    print(html)
```
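The key idea in this step is `requests.Session()`: the server identifies a logged-in user by cookies, and a session object stores the cookies set by the login response and sends them with every later request, which is why `get_html` can fetch the personal center page. A self-contained illustration using httpbin.org (a public testing service, not the course site above):

```python
import requests

s = requests.Session()  # a Session persists cookies across requests
s.get("https://httpbin.org/cookies/set/sessionid/abc123")  # server sets a cookie
r = s.get("https://httpbin.org/cookies")  # the cookie is sent back automatically
print(r.text)  # {"cookies": {"sessionid": "abc123"}}
```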
3. Saving Data to CSV and to a Database
Saving to CSV
```python
import csv
import requests
from lxml import etree

# fetch the page
def get_html(url, time=30):
    try:
        r = requests.get(url, timeout=time)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return r.text
    except Exception as error:
        print(error)

def parser(html):  # parsing function
    doc = etree.HTML(html)  # parse the HTML string into an lxml element tree
    out_list = []  # list collecting the parsed rows
    # two-step lookup: locate each book node first, then pick out its fields
    for row in doc.xpath("//*[@class='book-img-text']//li/*[@class='book-mid-info']"):
        row_data = [
            row.xpath("h4/a/text()")[0],                    # title
            row.xpath("p[@class='author']/a/text()")[0],    # author
            row.xpath("p[2]/text()")[0].strip(),            # description
            row.xpath("p[@class='update']/span/text()")[0]  # last updated
        ]
        out_list.append(row_data)  # append each parsed row to the output list
    return out_list

def save_csv(item, path):  # write the list to a CSV file; utf-8 avoids garbled text
    with open(path, "a+", newline='', encoding="utf-8") as f:  # open a utf-8 encoded file
        csv_write = csv.writer(f)  # create the writer object
        csv_write.writerows(item)  # write all rows in one call

if __name__ == "__main__":
    for i in range(1, 6):
        url = "https://www.qidian.com/rank/fengyun?style=1&page={0}".format(i)
        html = get_html(url)     # fetch the page
        out_list = parser(html)  # parse it into a list of rows
        save_csv(out_list, "d:\\book.csv")  # store the data
```
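One thing the script above does not do is write a header row, so the CSV opens as bare columns. An optional sketch, assuming the same `d:\book.csv` path: write the header once before the crawl loop, then read the file back to check the result (the column names are my own labels, not from the site):

```python
import csv

# write a header row once, before appending the scraped rows
header = ["title", "author", "description", "updated"]
with open("d:\\book.csv", "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerow(header)

# read the finished file back to verify it
with open("d:\\book.csv", encoding="utf-8") as f:
    for row in csv.reader(f):
        print(row)
```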
Saving to a Database
```python
import pymysql
import requests
from lxml import etree

def get_html(url, time=3000):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31"
        }
        r = requests.get(url, timeout=time, headers=headers)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return r.text
    except Exception as err:
        print(err)

result = []

def parse_html(html):
    html = etree.HTML(html)
    for row in html.xpath('//*[@id="content"]/div/div[1]/ul/li'):
        Naame = row.xpath("div[2]/h2/a/text()")[0].strip()          # title
        score = row.xpath("div[2]/p[2]/span[2]/text()")[0].strip()  # rating
        # the info line is slash-separated (e.g. "author / publisher / date / price"):
        # split it first, then index the list; reassigning price before indexing, as the
        # original did, would read single characters of a string instead of list items
        parts = row.xpath("div[2]/p[1]/text()")[0].strip().split("/")
        price = parts[0]
        content = parts[1]
        a = parts[2]
        b = parts[-1]
        detail = [Naame, score, price, content, a, b]
        result.append(detail)

def join_all(sql_insert, vals, **dbinfo):
    connet = pymysql.connect(**dbinfo)
    cursor = connet.cursor()
    try:
        cursor.executemany(sql_insert, vals)
        connet.commit()
        print("Insert succeeded!")
    except Exception as err:
        print(err)
        connet.rollback()
    finally:
        cursor.close()
        connet.close()

if __name__ == "__main__":
    parms = {
        "host": "127.0.0.1",
        "port": 3306,
        "user": "root",
        "passwd": "123456",
        "db": "db",
        "charset": "utf8"
    }
    # collect all pages first, then insert once; inserting inside the loop
    # would re-insert the earlier pages' rows on every iteration
    for page in range(1, 16):
        url = "https://book.douban.com/latest?subcat=%E5%85%A8%E9%83%A8&p={0}".format(page)
        html = get_html(url)
        parse_html(html)
    sql_insert = "INSERT INTO db(Naame,score,price,content,a,b) VALUES (%s,%s,%s,%s,%s,%s)"
    join_all(sql_insert, result, **parms)
    print(result)
```
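The INSERT statement assumes a table named `db` with the six columns `Naame`, `score`, `price`, `content`, `a`, `b` already exists; the article never shows its schema. A hypothetical DDL that would match the statement, with guessed VARCHAR types (adjust to your own design):

```python
import pymysql

# assumed schema; column names mirror the INSERT above, types are guesses
ddl = """
CREATE TABLE IF NOT EXISTS db (
    Naame   VARCHAR(255),  -- book title (spelling kept from the script)
    score   VARCHAR(16),   -- rating
    price   VARCHAR(64),   -- first slash-separated field
    content VARCHAR(64),   -- second field
    a       VARCHAR(64),   -- third field
    b       VARCHAR(64)    -- last field
)
"""
conn = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                       passwd="123456", db="db", charset="utf8")
with conn.cursor() as cur:
    cur.execute(ddl)
conn.commit()
conn.close()
```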
Summary
That's all for this article. I hope it helps, and I hope you'll keep following WalkonNet for more content!
Recommended reading:
- Scraping Hanfu-community images in 100 lines of Python
- Using XPath to extract page elements in Python
- Python web scraping: collecting second-hand housing listings
- With the Dragon Boat Festival approaching, use Python to scrape zongzi data and visualize which kind netizens like best!
- A Python example of scraping a set of photos of young women