Python 製作微博圖片爬取工具

Posted on 2021-01-16 by WalkonNet

有小半個月沒有發博客了，因為一直在研究 Python 的 GUI，買了一本書學習了一些基礎，用我所學做了我的第一款 GUI——微博圖片爬取工具。本軟件源代碼已經放在了博客中，另外軟件已經打包好上傳到網盤中以供下載學習。

一．準備工作

本次要用到以下依賴庫：re、json、os、random、tkinter、threading、requests、PIL，其中後兩個（requests 和 PIL）需要安裝後使用（PIL 透過 pip install pillow 安裝）。

二．預覽

1.啟動

2.運行中

3.結果

這裡只拿一張圖片作為展示。

三．設計流程

設計流程分為總體設計和詳細設計，這裡我會使用 Visio 畫出幾個流程圖，用以展示我的思路；其中詳細設計部分，我列舉了兩個函數實現的具體流程。

1.總體設計

此圖為整個系統的整體流程，也是本 GUI 軟件的使用過程。

2.詳細設計

在此列舉兩個函數：一個是搜索按鈕觸發的 wb_search 函數，一個是開始爬取按鈕觸發的 wb_pics_parse 函數。

2.1wb_search函數

2.2wb_pics_parse函數

四．源代碼

import json
import random
import re
import os
from tkinter import *
from tkinter import messagebox
from tkinter import ttk
import requests
import threading
from PIL import Image,ImageTk

"""
1.07使用check button 實現下載完打開文件夾操作，註冊瞭enter、esc熱鍵，優化瞭一些體驗
1.08 1.更新瞭關鍵字、磁盤、用戶判斷邏輯
   2.將之前的線程池改為多線程來執行下載操作
1.13說明：如果在下載過程變慢，可能是軟件正在解析圖片地址或者就是您的網絡不行
"""
class WeiBo_pics_Spider(object):
  """Crawl and download the pictures of one Weibo user.

  The class relies on module-level state created elsewhere in this script:
  the widgets ``t1`` and ``window``, the ``r1_var`` checkbox variable, the
  selected drive letter ``disk`` and the selected ``user_name_selected``.
  """

  def __init__(self,start_url):
    """Remember the paged container URL the crawl starts from."""
    self.start_url=start_url

  def get_pics_url(self):
    """Yield the large-size URL of every picture, page after page.

    Sets the global ``a_flag`` to True when the crawl starts; the crawl
    stops as soon as the API stops answering ``ok == 1`` or ``a_flag`` has
    been cleared. On stop, the final page number is written to ``t1`` and,
    when ``r1_var`` is 1, the download folder is opened.
    """
    global a_flag
    a_flag = True
    i = 1
    while True:
      url = self.start_url + '&page={}'.format(i)
      headers = {'User-Agent': get_ua()}
      r = requests.get(url, headers=headers)
      _json = json.loads(r.text)
      # Check the status flag BEFORE touching the payload: a failed
      # response is not guaranteed to carry data/cards.
      if _json.get('ok') != 1 or not a_flag:
        t1.insert(END, f'***在第{i}頁終止***\n')
        t1.see(END)
        t1.update()
        if r1_var.get() == 1:
          big_dir=disk+':/WeiBo_Pics'
          # The folder only exists once a picture has been saved; create it
          # so that opening it cannot fail on an empty crawl.
          os.makedirs(big_dir, exist_ok=True)
          os.startfile(big_dir)
        return
      items = (_json.get('data') or {}).get('cards') or []
      for v in items:
        mblog = v.get('mblog')
        if mblog is None:
          continue
        # 'pics' is absent for posts without pictures.
        for pic in mblog.get('pics') or []:
          yield pic['large']['url']
      i += 1

  def download_pics(self,url,filename):
    """Download ``url`` into <disk>:/WeiBo_Pics/<user>/<filename>.

    After the file is written, the file name is appended to ``t1`` and the
    window is refreshed.
    """
    headers={'User-Agent': get_ua()}
    r = requests.get(url, headers=headers)
    big_dir=disk+':/WeiBo_Pics'
    aim_path=big_dir+'/'+user_name_selected
    # exist_ok covers both "already there" and two download threads racing
    # to create the folder, without hiding real errors such as a missing
    # drive.
    os.makedirs(aim_path, exist_ok=True)
    with open(os.path.join(aim_path, filename), 'wb') as f:
      f.write(r.content)
    # Insert first, then scroll, so the newest line is the one made visible.
    t1.insert(END, f'{filename}\n')
    t1.see(END)
    # Refresh after every picture so the interface does not appear frozen.
    window.update()


def get_ua():
  """Return a randomized Chrome-style User-Agent string.

  The Chrome version is ``<55..62>.0.<0..3200>.<0..140>`` and the platform
  token is picked from four fixed desktop platforms.
  """
  # The draws happen in the same order as before: major, build, patch,
  # and finally the platform token.
  major = random.randint(55, 62)
  build = random.randint(0, 3200)
  patch = random.randint(0, 140)
  platforms = (
    '(Windows NT 6.1; WOW64)',
    '(Windows NT 10.0; WOW64)',
    '(X11; Linux x86_64)',
    '(Macintosh; Intel Mac OS X 10_12_6)',
  )
  platform = random.choice(platforms)
  return (
    f'Mozilla/5.0 {platform} AppleWebKit/537.36 (KHTML, like Gecko) '
    f'Chrome/{major}.0.{build}.{patch} Safari/537.36'
  )


def _show_no_user_found():
  """Report that the search matched nobody and clear the keyword entry."""
  messagebox.showinfo(title='提示', message='沒有檢索到相關用戶，請更換關鍵字或使用用戶id搜索！')
  l3.place(x=85, y=42)
  l3_var.set('請更換關鍵字或用戶id搜索！')
  l3['background']='yellow'
  # The old code wrote e1.bind('WM_TAKE_FOCUS', e1_clear()), which ran
  # e1_clear() on the spot and bound nothing useful. Clearing the entry
  # right away is the effect it actually had, so do exactly that.
  e1_clear()


def wb_search():
  """Look up Weibo users for the text in ``e1`` and list them in ``listb1``.

  A keyword that is exactly ten digits is treated as a user id and resolved
  to its screen name; anything else is sent to the user-search endpoint.
  The global ``user_id_list`` is rebuilt so that its indexes line up with
  the rows of ``listb1``. The outcome is shown in the ``l3`` status label.
  """
  global user_id_list
  # Empty the list box first so only the new result is shown.
  listb1.delete(0,END)
  url1='https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D3%26q%3D{}%26t%3D0'
  headers={'User-Agent':get_ua()}
  key_word = e1.get()
  user_id_list=list()

  if not key_word:
    messagebox.showinfo(title='info',message='請輸入關鍵字！')
    l3.place(x=110, y=42)
    l3_var.set('請輸入關鍵字！')
    l3['background'] = 'red'
    return

  # fullmatch: the WHOLE keyword must be ten digits. re.match only anchored
  # the start, so longer inputs beginning with ten digits were mistaken for
  # a user id.
  if re.fullmatch(r'\d{10}',key_word):
    url2 = f'https://m.weibo.cn/api/container/getIndex?uid={key_word}&containerid=100505{key_word}'
    r1 = requests.get(url2, headers=headers)
    _data = json.loads(r1.text)
    user_info = (_data.get('data') or {}).get('userInfo')
    if not user_info:
      # Unknown id: report it and leave user_id_list empty.
      _show_no_user_found()
      return
    user_id_list.append(key_word)
    l3.place(x=120, y=42)
    l3_var.set('搜索成功')
    l3['background'] = 'green'
    listb1.insert(END, user_info.get('screen_name'))
    return

  # Otherwise search users by keyword.
  r=requests.get(url1.format(key_word),headers=headers)
  _json=json.loads(r.text)
  try:
    # Too few cards (IndexError) or a missing key (KeyError) both mean that
    # no user was found.
    users=_json['data']['cards'][1].get('card_group')
  except (IndexError, KeyError):
    users=None
  if users is None:
    _show_no_user_found()
    return

  relevant_num=len(users)
  l3.place(x=105, y=42)
  l3_var.set(f'搜索到瞭 {relevant_num} 個用戶')
  l3['background']='green'
  for user_ in users:
    user_info=user_.get('user')
    user_name=user_info.get('screen_name')
    uid = user_info.get('id')
    # Append in lockstep so row n of listb1 is user_id_list[n].
    user_id_list.append(uid)
    listb1.insert(END,user_name)

def _parse_warning(message, status, x):
  """Show a warning dialog and mirror it in the red ``l3`` status label."""
  messagebox.showwarning(title='警告', message=message)
  l3.place(x=x, y=42)
  l3_var.set(status)
  l3['background'] = 'red'


def wb_pics_parse():
  """Validate the form, then download every picture of the selected user.

  Checks, in order: a keyword was typed, a one-letter drive was chosen, the
  drive exists, and a user is selected in ``listb1``. Each failure shows a
  warning and returns. On success the global ``user_name_selected`` is set
  and one download thread is started per picture URL.
  """
  global user_name_selected
  key_word=e1.get()
  select_path=c1.get()

  # 1. A keyword must have been entered.
  if not key_word:
    _parse_warning('請輸入關鍵字！', '請輸入關鍵字！', 110)
    return
  # 2. A drive letter must have been chosen.
  if len(select_path)!=1:
    _parse_warning('您未選擇磁盤!', '請檢查是否選擇瞭磁盤！', 85)
    return
  # 3. The drive must exist. The old test was
  #    `not os.path.exists(select_path)`, which looked for a relative path
  #    named after the bare letter and so never rejected a missing drive.
  if not os.path.exists(select_path+':/'):
    _parse_warning('請檢查路徑！', '請檢查路徑！', 80)
    return
  # 4. A user must be selected. Test the selection explicitly and do not
  #    wait for a TclError from listb1.get().
  selection=listb1.curselection()
  if not selection:
    _parse_warning('請選擇一個用戶！', '請選擇一個用戶！', 105)
    return

  user_name_index = selection[0]
  user_name_selected=listb1.get(user_name_index)
  user_id = user_id_list[user_name_index]
  container_id = '107603' + str(user_id)
  start_url = f'https://m.weibo.cn/api/container/getIndex?containerid={container_id}'
  spider = WeiBo_pics_Spider(start_url)
  t1.config(state='normal') # make the Text widget writable
  l3.place(x=120, y=42)
  l3_var.set('正在運行......')
  l3['background'] = 'green'
  for pic_url in spider.get_pics_url():
    full_name = pic_url.split('/')[-1]
    # Drop the first 10 characters of the name; keep the full name if that
    # would leave nothing to write to.
    filename = full_name[10:] or full_name
    thread_it(spider.download_pics,pic_url,filename)

def open_disk():
  """Open <drive>:/WeiBo_Pics in the file manager, creating it if needed.

  The drive letter comes from the ``c1`` combobox. A warning is shown when
  no drive is selected or when the folder cannot be created or opened.
  """
  disk=c1.get()
  if len(disk)!=1:
    messagebox.showwarning(title='警告', message='您未選中磁盤！')
    l3.place(x=115, y=42)
    l3_var.set('您未選中磁盤！')
    l3['background'] = 'red'
    return

  big_dir=disk+':/WeiBo_Pics'
  try:
    if not os.path.exists(big_dir):
      os.mkdir(big_dir)
    os.startfile(big_dir)
  except OSError:
    # Only file-system failures (e.g. a missing drive) mean "disk does not
    # exist"; a bare except also hid unrelated programming errors here.
    messagebox.showwarning(title='警告',message='選中的磁盤不存在！')
    l3.place(x=110, y=42)
    l3_var.set('選中的磁盤不存在！')
    l3['background'] = 'red'


def window_quit():
  """Ask for confirmation and, when confirmed, tear down the main window."""
  confirmed = messagebox.askyesno(title='提示',message='是否要退出？')
  if not confirmed:
    return
  window.destroy()
  window.quit()


def e1_clear():
  """Remove all text from the keyword entry ``e1``."""
  e1.delete(0, END)

def print_path(event):
  """Handle a combobox selection: store the drive and report the target.

  Writes the chosen letter to the global ``disk``. ``event`` is the Tk
  event object and is not used.
  """
  global disk
  disk = c1.get()

  if len(disk) != 1:
    messagebox.showwarning(title='警告', message='請先選定磁盤！')
    l3.place(x=120, y=42)
    l3_var.set('請先選定磁盤！')
    l3['background'] = 'red'
    return

  # The existence check needs the full root path, not the bare letter.
  if not os.path.exists(disk + ':/'):
    messagebox.showerror(title='錯誤',message='選定磁盤不存在!')
    l3.place(x=100, y=42)
    l3_var.set('選中的磁盤不存在！')
    l3['background'] = 'red'
    return

  messagebox.showinfo(title='提示',message=f'文件將存儲到：{disk}:/WeiBo_Pics目錄下')

def switch():
  """Flip ``r1_var``: 0 becomes 1, any other value becomes 0."""
  r1_var.set(1 if r1_var.get() == 0 else 0)


def escape(event):
  """Key-binding adapter: ignore ``event`` and run the quit dialog."""
  window_quit()

def enter(event):
  """Key-binding adapter: ignore ``event`` and call wb_search directly."""
  wb_search()

def thread_it(func, *args):
  """Run ``func(*args)`` in a background daemon thread.

  Keeps long-running work off the thread that services the UI so the
  window does not freeze. The thread is not joined (joining would block
  the caller) and, being a daemon, it does not keep the process alive.
  """
  # daemon=True replaces Thread.setDaemon(), deprecated since Python 3.10.
  t = threading.Thread(target=func, args=args, daemon=True)
  t.start()

# Create the fixed-size main window and centre it on the screen.
window=Tk()
width=310
height=395
screenWidth = window.winfo_screenwidth() # width of the display area
screenHeight = window.winfo_screenheight() # height of the display area
left = (screenWidth - width) / 2
top = (screenHeight - height) / 2
window.geometry("%dx%d+%d+%d" % (width, height, left, top))
window.resizable(0,0)
window.title('微博圖片采集工具-v1.08')
# Set the window icon (path is relative to the working directory).
ico_path=r'./rely/icon.ico'
window.iconbitmap(ico_path)
# Show the logo image in a Label at the top of the window.
photo = Image.open("./rely/w_b.png") # image displayed in the GUI
photo = photo.resize((150, 40)) # fix the displayed image size
img0 = ImageTk.PhotoImage(photo)
l1=ttk.Label(window,imag=img0,justify='center')
l1.pack()

# Status label; the handlers update it through l3_var and l3['background'].
l3_var=StringVar()
l3=ttk.Label(window,background='yellow',textvar=l3_var)
l3.place(x=120,y=42)
l3_var.set('還沒搜索')


# Caption for the keyword / user-id entry (the name l1 is reused here).
l1=ttk.Label(window,text='關鍵字或\n用戶id：')
l1.place(x=13,y=60)

# Entry holding the search keyword or user id.
e1=ttk.Entry(window,justify='center')
e1.place(x=80,y=65)


l4=ttk.Label(window,text='磁盤:')
l4.place(x=13,y=100)

# Read-only drive picker; print_path runs on every selection.
disk_list=['C','D','E','F','G','H','I']
c1=ttk.Combobox(window,justify='center',state='readonly',width=17,value=disk_list)
# NOTE(review): the original comment said index 0 (drive C) is selected by
# default, but no c1.current(0) call is visible here — the box appears to
# start empty. Confirm before relying on a default drive.
c1.bind('<<ComboboxSelected>>', print_path)
c1.place(x=80,y=100)


# "Open the folder when the download finishes" option, on by default.
# The Checkbutton is bound to r1_var so the tick mark always shows the
# value the crawler reads. Previously the widget had no variable and a
# command flipped r1_var by hand, so the box started unticked while r1_var
# was 1 and every click showed the opposite of the real setting.
r1_var=IntVar()
r1_var.set(1)
check1=Checkbutton(window,text='下載完\n打開文件夾',variable=r1_var)
check1.place(x=223,y=90)


# Search button: runs wb_search in a background thread via thread_it.
b1=ttk.Button(window,text='搜索',command=lambda:thread_it(wb_search),width=7)
b1.place(x=230,y=63)

# List box that receives the screen names found by wb_search.
l5=ttk.Label(window,text='用戶列表:')
l5.place(x=13,y=150)
lb1_var=StringVar()
listb1=Listbox(window,justify='center',listvariable=lb1_var,width=20,height=4)
listb1.place(x=80,y=135)

# Start button: runs wb_pics_parse in a background thread via thread_it.
b2=ttk.Button(window,text='開始爬取',command=lambda :thread_it(wb_pics_parse,),width=7)
b2.place(x=230,y=160)

l6=ttk.Label(window,text='狀態：')
l6.place(x=13,y=280)

# Log area, created with state='disable'; wb_pics_parse switches it to
# 'normal' before the download starts.
t1=Text(window,width=23,font=('times new roman',10),state='disable')
t1.place(x=80,y=230,height=140)

# "Open folder" button.
b3=ttk.Button(window,text=' 打開\n文件夾',width=7,command=open_disk)
b3.place(x=230,y=230)

# "Quit" button (the name b3 is reassigned to this second button).
b3=ttk.Button(window,text='退出',width=7,command=window_quit)
b3.place(x=230,y=315)


# Red disclaimer at the bottom (the name l6 is reused for this label).
f1 = ttk.LabelFrame(window)
f1.place(x=65,y=350)
l6=ttk.Label(f1,text='敬告：本軟件僅供學習交流使用！',foreground='red')
l6.pack(anchor="w",fill=X)

# Bind the Esc key to the quit dialog.
window.bind('<Escape>',escape)
# Bind the Return key on the entry so that Enter triggers a search.
e1.bind('<Return>',enter)

# Route the window-close button through the same quit confirmation.
window.protocol('WM_DELETE_WINDOW',window_quit)
window.mainloop()

五．總結說明

本軟件僅供學習交流使用！圖源水印，在此僅作舉例！
由於這是第一次做 GUI，因此遇到了一些問題，在此列舉一下：
1.窗口佈局問題（GUI基礎）
2.主窗口執行一個比較耗時操作導致卡死、崩潰（線程問題）。
3.主窗口關閉後，後臺線程還在運行（線程問題）。

以上問題已經全部解決，軟件切實可用。

另外,本軟件有四大亮點：

1.使用線程下載圖片
2.智能標簽提醒
3.輸入關鍵字直接敲回車能夠完成搜索
4.Esc快速退出軟件
軟件打包好了，放在了藍奏雲：https://wws.lanzous.com/iPSpzkchj5i

以上就是 Python 製作微博圖片爬取工具的詳細內容，更多關於 Python 微博圖片爬取的資料請關注 WalkonNet 其它相關文章！