python爬取B站關註列表及數據庫的設計與操作

一、數據庫的設計與操作

1、數據的分析

B站的關註列表在

https://api.bilibili.com/x/relation/followings?vmid=UID&pn=1&ps=50&order=desc&order_type=attention

中,一頁最多50條信息。

我們大致分析一下信息,

{
	"code": 0,
	"message": "0",
	"ttl": 1,
	"data": {
		"list": [{……

首先,列表內容存在data:list裡。

其次,對於列表中每一項,有如下信息

			"mid": 672353429,
			"attribute": 2,
			"mtime": 1630510107,
			"tag": null,
			"special": 0,
			"contract_info": {
				"is_contractor": false,
				"ts": 0,
				"is_contract": false,
				"user_attr": 0
			},
			"uname": "貝拉kira",
			"face": "http://i2.hdslb.com/bfs/face/668af440f8a8065743d3fa79cfa8f017905d0065.jpg",
			"sign": "元氣滿滿的A-SOUL舞擔參上~目標TOP IDOL,一起加油!",
			"official_verify": {
				"type": 0,
				"desc": "虛擬偶像團體A-SOUL 所屬藝人"
			},
			"vip": {
				"vipType": 2,
				"vipDueDate": 1674576000000,
				"dueRemark": "",
				"accessStatus": 0,
				"vipStatus": 1,
				"vipStatusWarn": "",
				"themeType": 0,
				"label": {
					"path": "",
					"text": "年度大會員",
					"label_theme": "annual_vip",
					"text_color": "#FFFFFF",
					"bg_style": 1,
					"bg_color": "#FB7299",
					"border_color": ""
				},
				"avatar_subscript": 1,
				"nickname_color": "#FB7299",
				"avatar_subscript_url": "http://i0.hdslb.com/bfs/vip/icon_Certification_big_member_22_3x.png"
			}

其中,mid為用戶獨一無二的UID,vipType,0是什麼都沒開,1是大會員,2是年度大會員,official_verify中,type 0代表官方認證,-1代表沒有官方認證。

同時我們發現,如果對方鎖瞭列表,會返回

{"code":-400,"message":"請求錯誤","ttl":1}

2、數據庫設計

基於這些,我們先設計數據庫,包含兩張表,用戶信息的基本屬性表和關註的關系表。

def createDB():
    link=sqlite3.connect('BiliFollowDB.db')
    print("database open success")
    UserTableDDL='''
                create table if not exists user(
                UID int PRIMARY KEY     NOT NULL,
                NAME varchar            NOT NULL,
                SIGN varchar            DEFAULT NULL,
                vipType int             NOT NULL,
                verifyType int          NOT NULL,
                verifyDesc varchar      DEFAULT NULL)
                '''
    RelationTableDDL='''
                create table if not exists relation(
                follower int           NOT NULL,
                following int          NOT NULL,
                followTime int         NOT NULL,
                PRIMARY KEY (follower,following),
                FOREIGN KEY(follower,following) REFERENCES user(UID,UID)
                )
                '''
    # create user table
    link.execute(UserTableDDL)
    # create relation table
    link.execute(RelationTableDDL)
    print("database create success")
    link.commit()
    link.close()

3、數據庫操作

其次是插入新用戶的列表,我的思路是爬完一個人的關註列表,把一整個list丟給該函數,判斷是否存在新增用戶,存在則把新增用戶傳回,作為下一次爬蟲的起點。

def insertUser(infos):
    conn=sqlite3.connect('BiliFollowDB.db')
    link=conn.cursor()
    InsertCmd="insert into user (UID,NAME,vipType,verifyType,sign,verifyDesc) values (?,?,?,?,?,?);"
    ExistCmd="select count(UID) from user where UID='%d';"# % UID
    newID=[]
    for info in infos:
        answer=link.execute(ExistCmd%info['uid'])
        for row in answer:
            exist_ID=row[0]
        if exist_ID==0:
            newID.append(info['uid'])
            link.execute(InsertCmd,(info['uid'],info['name'],info['vipType'],info['verifyType'],info['sign'],info['verifyDesc']))
    conn.commit()
    conn.close()
    return newID

然後是插入關系的函數,這個比較簡單

def insertFollowing(uid:int,subscribe):
    conn=sqlite3.connect('BiliFollowDB.db')
    link=conn.cursor()
    InsertCmd="insert into relation (follower,following,followTime) values (?,?,?);"
    for follow in subscribe:
        link.execute(InsertCmd,(uid,follow[0],follow[1]))
    conn.commit()
    conn.close()
 

二、爬蟲

通過觀察,我們發現睿叔叔鎖瞭5頁的關註列表

即使是人工操作也隻能訪問5頁,那沒辦法啦,我們就爬5頁吧。

def getFollowingList(uid:int):
    url="https://api.bilibili.com/x/relation/followings?vmid=%d&pn=%d&ps=50&order=desc&order_type=attention&jsonp=jsonp"# % (UID, Page Number)
    infos=[]
    subscribe=[]
    for i in range(1,6):
        html=requests.get(url%(uid,i))
        if html.status_code!=200:
            print("GET ERROR!")
        text=html.text
        dic=json.loads(text)
        if dic['code']==-400:
            break
        list=dic['data']['list']
        for usr in list:
            info={}
            info['uid']=usr['mid']
            info['name']=usr['uname']
            info['vipType']=usr['vip']['vipType']
            info['verifyType']=usr['official_verify']['type']
            info['sign']=usr['sign']
            if info['verifyType']==-1:
                info['verifyDesc']='NULL'
            else :
                info['verifyDesc']=usr['official_verify']['desc']
            subscribe.append((usr['mid'],usr['mtime']))
            infos.append(info)
    newID=insertUser(infos)
    insertFollowing(uid,subscribe)
    return newID

三、完整代碼

#by concyclics
# -*- coding:UTF-8 -*-
import sqlite3
import json
import requests
def createDB():
    link=sqlite3.connect('BiliFollowDB.db')
    print("database open success")
    UserTableDDL='''
                create table if not exists user(
                UID int PRIMARY KEY     NOT NULL,
                NAME varchar            NOT NULL,
                SIGN varchar            DEFAULT NULL,
                vipType int             NOT NULL,
                verifyType int          NOT NULL,
                verifyDesc varchar      DEFAULT NULL)
                '''
    RelationTableDDL='''
                create table if not exists relation(
                follower int           NOT NULL,
                following int          NOT NULL,
                followTime int         NOT NULL,
                PRIMARY KEY (follower,following),
                FOREIGN KEY(follower,following) REFERENCES user(UID,UID)
                )
                '''
    # create user table
    link.execute(UserTableDDL)
    # create relation table
    link.execute(RelationTableDDL)
    print("database create success")
    link.commit()
    link.close()
def insertUser(infos):
    conn=sqlite3.connect('BiliFollowDB.db')
    link=conn.cursor()
    InsertCmd="insert into user (UID,NAME,vipType,verifyType,sign,verifyDesc) values (?,?,?,?,?,?);"
    ExistCmd="select count(UID) from user where UID='%d';"# % UID
    newID=[]
    for info in infos:
        answer=link.execute(ExistCmd%info['uid'])
        for row in answer:
            exist_ID=row[0]
        if exist_ID==0:
            newID.append(info['uid'])
            link.execute(InsertCmd,(info['uid'],info['name'],info['vipType'],info['verifyType'],info['sign'],info['verifyDesc']))
    conn.commit()
    conn.close()
    return newID
def insertFollowing(uid:int,subscribe):
    conn=sqlite3.connect('BiliFollowDB.db')
    link=conn.cursor()
    InsertCmd="insert into relation (follower,following,followTime) values (?,?,?);"
    for follow in subscribe:
        try:
            link.execute(InsertCmd,(uid,follow[0],follow[1]))
        except:
            print((uid,follow[0],follow[1]))
    conn.commit()
    conn.close()
def getFollowingList(uid:int):
    url="https://api.bilibili.com/x/relation/followings?vmid=%d&pn=%d&ps=50&order=desc&order_type=attention&jsonp=jsonp"# % (UID, Page Number)
    infos=[]
    subscribe=[]
    for i in range(1,6):
        html=requests.get(url%(uid,i))
        if html.status_code!=200:
            print("GET ERROR!")
            return []
        text=html.text
        dic=json.loads(text)
        if dic['code']==-400:
            return []
        try:
            list=dic['data']['list']
        except:
            return []
        for usr in list:
            info={}
            info['uid']=usr['mid']
            info['name']=usr['uname']
            info['vipType']=usr['vip']['vipType']
            info['verifyType']=usr['official_verify']['type']
            info['sign']=usr['sign']
            if info['verifyType']==-1:
                info['verifyDesc']='NULL'
            else :
                info['verifyDesc']=usr['official_verify']['desc']
            subscribe.append((usr['mid'],usr['mtime']))
            infos.append(info)
    newID=insertUser(infos)
    insertFollowing(uid,subscribe)
    return newID
def getFollowingUid(uid:int):
    url="https://api.bilibili.com/x/relation/followings?vmid=%d&pn=%d&ps=50&order=desc&order_type=attention&jsonp=jsonp"# % (UID, Page Number)
    for i in range(1,6):
        html=requests.get(url%(uid,i))
        if html.status_code!=200:
            print("GET ERROR!")
            return []
        text=html.text
        dic=json.loads(text)
        if dic['code']==-400:
            return []
        try:
            list=dic['data']['list']
        except:
            return []
        IDs=[]
        for usr in list:
            IDs.append(usr['mid'])
        return IDs
def work(root):
    IDlist=root
    tmplist=[]
    while len(IDlist)!=0:
        tmplist=[]
        for ID in IDlist:
            print(ID)
            tmplist+=getFollowingList(ID)
        IDlist=tmplist
def rework():
    conn=sqlite3.connect('BiliFollowDB.db')
    link=conn.cursor()
    SelectCmd="select uid from user;"
    answer=link.execute(SelectCmd)
    IDs=[]
    for row in answer:
        IDs.append(row[0])
    conn.commit()
    conn.close()
    newID=[]
    print(IDs)
    for ID in IDs:
        ids=getFollowingUid(ID)
        for id in ids:
            if id not in IDs:
                newID.append(id)
    return newID
if __name__=="__main__":
    createDB()
    #work([**put root UID here**,])

四、項目倉庫

https://github.com/Concyclics/BiliBiliFollowSpider

以上就是python爬取B站關註列表及數據庫的設計與操作的詳細內容,更多關於python爬取B站關註列表的資料請關註WalkonNet其它相關文章!

推薦閱讀: