Python數據分析之Python和Selenium爬取BOSS直聘崗位
一、數據爬取的代碼
#encoding='utf-8' from selenium import webdriver import time import re import pandas as pd import os def close_windows(): #如果有登錄彈窗,就關閉 try: time.sleep(0.5) if dr.find_element_by_class_name("jconfirm").find_element_by_class_name("closeIcon"): dr.find_element_by_class_name("jconfirm").find_element_by_class_name("closeIcon").click() except BaseException as e: print('close_windows,沒有彈窗',e) def get_current_region_job(k_index): flag = 0 # page_num_set=0#每區獲取多少條數據,對30取整 df_empty = pd.DataFrame(columns=['崗位', '地點', '薪資', '工作經驗', '學歷', '公司', '技能']) while (flag == 0): # while (page_num_set<151)&(flag == 0):#每次隻能獲取150條信息 time.sleep(0.5) close_windows() job_list = dr.find_elements_by_class_name("job-primary") for job in job_list:#獲取當前頁的職位30條 job_name = job.find_element_by_class_name("job-name").text # print(job_name) job_area = job.find_element_by_class_name("job-area").text salary = job.find_element_by_class_name("red").get_attribute("textContent") # 獲取薪資 # salary_raw = job.find_element_by_class_name("red").get_attribute("textContent") # 獲取薪資 # salary_split = salary_raw.split('·') # 根據·分割 # salary = salary_split[0] # 隻取薪資,去掉多少薪 # if re.search(r'天', salary): # continue experience_education = job.find_element_by_class_name("job-limit").find_element_by_tag_name( "p").get_attribute("innerHTML") # experience_education_raw = '1-3年<em class="vline"></em>本科' experience_education_raw = experience_education split_str = re.search(r'[a-zA-Z =<>/"]{23}', experience_education_raw) # 搜索分割字符串<em class="vline"></em> # print(split_str) experience_education_replace = re.sub(r'[a-zA-Z =<>/"]{23}', ",", experience_education_raw) # 分割字符串替換為逗號 # print(experience_education_replace) experience_education_list = experience_education_replace.split(',') # 根據逗號分割 # print('experience_education_list:',experience_education_list) if len(experience_education_list)!=2: print('experience_education_list不是2個,跳過該數據',experience_education_list) break experience = experience_education_list[0] education = experience_education_list[1] # print(experience) # print(education) company = job.find_element_by_class_name("company-text").find_element_by_class_name("name").text skill_list = job.find_element_by_class_name("tags").find_elements_by_class_name("tag-item") skill = [] for skill_i in skill_list: skill_i_text = skill_i.text if len(skill_i_text) == 0: continue skill.append(skill_i_text) # print(job_name) # print(skill) df_empty.loc[k_index, :] = [job_name, job_area, salary, experience, education, company, skill] k_index = k_index + 1 # page_num_set=page_num_set+1 print("已經讀取數據{}條".format(k_index)) close_windows() try:#點擊下一頁 cur_page_num=dr.find_element_by_class_name("page").find_element_by_class_name("cur").text # print('cur_page_num',cur_page_num) #點擊下一頁 element = dr.find_element_by_class_name("page").find_element_by_class_name("next") dr.execute_script("arguments[0].click();", element) time.sleep(1) # print('點擊下一頁') new_page_num=dr.find_element_by_class_name("page").find_element_by_class_name("cur").text # print('new_page_num',new_page_num) if cur_page_num==new_page_num: flag = 1 break except BaseException as e: print('點擊下一頁錯誤',e) break print(df_empty) if os.path.exists("數據.csv"):#存在追加,不存在創建 df_empty.to_csv('數據.csv', mode='a', header=False, index=None, encoding='gb18030') else: df_empty.to_csv("數據.csv", index=False, encoding='gb18030') return k_index def main(): # 打開瀏覽器 # dr = webdriver.Firefox() global dr dr = webdriver.Chrome() # dr = webdriver.Ie() # # 後臺打開瀏覽器 # option=webdriver.ChromeOptions() # option.add_argument('headless') # dr = webdriver.Chrome(chrome_options=option) # print("打開瀏覽器") # 將瀏覽器最大化顯示 dr.maximize_window() # 轉到目標網址 # dr.get("https://www.zhipin.com/job_detail/?query=Python&city=100010000&industry=&position=")#全國 dr.get("https://www.zhipin.com/c101010100/?query=Python&ka=sel-city-101010100")#北京 print("打開網址") time.sleep(5) k_index = 0#數據條數、DataFrame索引 flag_hot_city=0 for i in range(3,17,1): # print('第',i-2,'頁') # try: # 獲取城市 close_windows() hot_city_list = dr.find_element_by_class_name("condition-city").find_elements_by_tag_name("a") close_windows() # hot_city_list[i].click()#防止彈窗,改為下面兩句 # element_hot_city_list_first = hot_city_list[i] dr.execute_script("arguments[0].click();", hot_city_list[i]) # 輸出城市名 close_windows() hot_city_list = dr.find_element_by_class_name("condition-city").find_elements_by_tag_name("a") print('城市:{}'.format(i-2),hot_city_list[i].text) time.sleep(0.5) # 獲取區縣 for j in range(1,50,1): # print('第', j , '個區域') # try: # close_windows() # hot_city_list = dr.find_element_by_class_name("condition-city").find_elements_by_tag_name("a") # 在這個for循環點一下城市,不然識別不到當前頁面已經更新瞭 close_windows() hot_city_list = dr.find_element_by_class_name("condition-city").find_elements_by_tag_name("a") close_windows() # hot_city_list[i].click()#防止彈窗,改為下面 dr.execute_script("arguments[0].click();", hot_city_list[i]) #輸出區縣名稱 close_windows() city_district = dr.find_element_by_class_name("condition-district").find_elements_by_tag_name("a") if len(city_district)==j: print('遍歷完所有區縣,沒有不可點擊的,跳轉下一個城市') break print('區縣:',j, city_district[j].text) # city_district_value=city_district[j].text#當前頁面的區縣值 # 點擊區縣 close_windows() city_district= dr.find_element_by_class_name("condition-district").find_elements_by_tag_name("a") close_windows() # city_district[j].click()]#防止彈窗,改為下面兩句 # element_city_district = city_district[j] dr.execute_script("arguments[0].click();", city_district[j]) #判斷區縣是不是點完瞭 close_windows() hot_city_list = dr.find_element_by_class_name("condition-city").find_elements_by_tag_name("a") print('點擊後這裡應該是區縣', hot_city_list[1].text)#如果是不限,說明點完瞭,跳出 hot_city_list = dr.find_element_by_class_name("condition-city").find_elements_by_tag_name("a") print('如果點完瞭,這裡應該是不限:',hot_city_list[1].text) hot_city_list = dr.find_element_by_class_name("condition-city").find_elements_by_tag_name("a") if hot_city_list[1].text == '不限': print('當前區縣已經點完瞭,點擊下一個城市') flag_hot_city=1 break close_windows() k_index = get_current_region_job(k_index)#獲取職位,爬取數據 # 重新點回城市頁面,再次獲取區縣。但此時多瞭區縣,所以i+1 close_windows() hot_city_list = dr.find_element_by_class_name("condition-city").find_elements_by_tag_name("a") close_windows() # hot_city_list[i+1].click()#防止彈窗,改為下面兩句 # element_hot_city_list_again = hot_city_list[i+1] dr.execute_script("arguments[0].click();", hot_city_list[i+1]) # except BaseException as e: # print('main的j循環-獲取區縣發生錯誤:', e) # close_windows() time.sleep(0.5) # except BaseException as e: # print('main的i循環發生錯誤:',e) # close_windows() time.sleep(0.5) # 退出瀏覽器 dr.quit() # p1.close() if __name__ == '__main__': main()
二、獲取到的數據如圖所示
三、數據分析的代碼
# coding=utf-8 import collections import wordcloud import re import pandas as pd import numpy as np import os import matplotlib.pyplot as plt plt.rcParams['font.sans-serif'] = ['SimHei'] # 顯示中文標簽 plt.rcParams['axes.unicode_minus'] = False # 設置正常顯示符號 def create_dir_not_exist(path): # 判斷文件夾是否存在,不存在-新建 if not os.path.exists(path): os.mkdir(path) create_dir_not_exist(r'./image') create_dir_not_exist(r'./image/city') data = pd.read_csv('數據.csv', encoding='gb18030') data_df = pd.DataFrame(data) print("\n查看是否有缺失值\n", data_df.isnull().sum()) data_df_del_empty = data_df.dropna(subset=['崗位'], axis=0) # print("\n刪除缺失值‘崗位'的整行\n",data_df_del_empty) data_df_del_empty = data_df_del_empty.dropna(subset=['公司'], axis=0) # print("\n刪除缺失值‘公司'的整行\n",data_df_del_empty) print("\n查看是否有缺失值\n", data_df_del_empty.isnull().sum()) print('去除缺失值後\n', data_df_del_empty) data_df_python_keyword = data_df_del_empty.loc[data_df_del_empty['崗位'].str.contains('Python|python')] # print(data_df_python_keyword)#篩選帶有python的行 # 區間最小薪資 data_df_python_keyword_salary = data_df_python_keyword['薪資'].str.split('-', expand=True)[0] print(data_df_python_keyword_salary) # 區間最小薪資 # Dataframe新增一列 在第 列新增一列名為' ' 的一列 數據 data_df_python_keyword.insert(7, '區間最小薪資(K)', data_df_python_keyword_salary) print(data_df_python_keyword) # 城市地區 data_df_python_keyword_location_city = data_df_python_keyword['地點'].str.split('·', expand=True)[0] print(data_df_python_keyword_location_city) # 北京 data_df_python_keyword_location_district = data_df_python_keyword['地點'].str.split('·', expand=True)[1] print(data_df_python_keyword_location_district) # 海淀區 data_df_python_keyword_location_city_district = [] for city, district in zip(data_df_python_keyword_location_city, data_df_python_keyword_location_district): city_district = city + district data_df_python_keyword_location_city_district.append(city_district) print(data_df_python_keyword_location_city_district) # 北京海淀區 # Dataframe新增一列 在第 列新增一列名為' ' 的一列 數據 data_df_python_keyword.insert(8, '城市地區', data_df_python_keyword_location_city_district) print(data_df_python_keyword) data_df_python_keyword.insert(9, '城市', data_df_python_keyword_location_city) data_df_python_keyword.insert(10, '地區', data_df_python_keyword_location_district) data_df_python_keyword.to_csv("data_df_python_keyword.csv", index=False, encoding='gb18030') print('-------------------------------------------') def draw_bar(row_lable, title): figsize_x = 10 figsize_y = 6 global list1_education, list2_education, df1, df2 plt.figure(figsize=(figsize_x, figsize_y)) list1_education = [] list2_education = [] for df1, df2 in data_df_python_keyword.groupby(row_lable): list1_education.append(df1) list2_education.append(len(df2)) # print(list1_education) # print(list2_education) # 利用 * 解包方式 將 一個排序好的元組,通過元組生成器再轉成list # print(*sorted(zip(list2_education,list1_education))) # print(sorted(zip(list2_education,list1_education))) # 排序,兩個列表對應原始排序,按第幾個列表排序,註意先後位置 list2_education, list1_education = (list(t) for t in zip(*sorted(zip(list2_education, list1_education)))) plt.bar(list1_education, list2_education) plt.title('{}'.format(title)) plt.savefig('./image/{}分析.jpg'.format(title)) # plt.show() plt.close() # 學歷 draw_bar('學歷', '學歷') draw_bar('工作經驗', '工作經驗') draw_bar('區間最小薪資(K)', '14個熱門城市的薪資分佈情況(K)') # ----------------------------------------- # 根據城市地區求均值 list_group_city1 = [] list_group_city2 = [] for df1, df2 in data_df_python_keyword.groupby(data_df_python_keyword['城市地區']): # print(df1) # print(df2) list_group_city1.append(df1) salary_list_district = [int(i) for i in (df2['區間最小薪資(K)'].values.tolist())] district_salary_mean = round(np.mean(salary_list_district), 2) # 每個區縣的平均薪資 round(a, 2)保留2位小數 list_group_city2.append(district_salary_mean) list_group_city2, list_group_city1 = (list(t) for t in zip(*sorted(zip(list_group_city2, list_group_city1), reverse=False))) # # print(list_group_city1) # print(list_group_city2) plt.figure(figsize=(10, 50)) plt.barh(list_group_city1, list_group_city2) # 坐標軸上的文字說明 for ax, ay in zip(list_group_city1, list_group_city2): # 設置文字說明 第一、二個參數:坐標軸上的值; 第三個參數:說明文字;ha:垂直對齊方式;va:水平對齊方式 plt.text(ay, ax, '%.2f' % ay, ha='center', va='bottom') plt.title('14個熱門城市的各區縣招聘工資情況(K)') plt.savefig('./image/14個熱門城市的各區縣招聘工資情況(K).jpg') # plt.show() plt.close() # ----------------------------------------- # 根據城市分組排序, list_group_city11 = [] list_group_city22 = [] list_group_city33 = [] list_group_city44 = [] for df_city1, df_city2 in data_df_python_keyword.groupby(data_df_python_keyword['城市']): # print(df_city1)#市 # print(df_city2) list_group_district2 = [] # 區縣列表 district_mean_salary2 = [] # 工資均值列表 for df_district1, df_district2 in df_city2.groupby(data_df_python_keyword['地區']): # print(df_district1)#區縣 # print(df_district2)#工作 list_group_district2.append(df_district1) # 記錄區縣 salary_list_district2 = [int(i) for i in (df_district2['區間最小薪資(K)'].values.tolist())] # 工資列表 district_salary_mean2 = round(np.mean(salary_list_district2), 2) # 每個區縣的平均薪資 round(a, 2)保留2位小數 district_mean_salary2.append(district_salary_mean2) # 記錄區縣的平均工作的列表 district_mean_salary2, list_group_district2 = (list(tt) for tt in zip( *sorted(zip(district_mean_salary2, list_group_district2), reverse=True))) plt.figure(figsize=(10, 6)) plt.bar(list_group_district2, district_mean_salary2) # 坐標軸上的文字說明 for ax, ay in zip(list_group_district2, district_mean_salary2): # 設置文字說明 第一、二個參數:坐標軸上的值; 第三個參數:說明文字;ha:垂直對齊方式;va:水平對齊方式 plt.text(ax, ay, '%.2f' % ay, ha='center', va='bottom') plt.title('14個熱門城市的各區縣招聘工資情況_{}(K)'.format(df_city1)) plt.savefig('./image/city/14個熱門城市的各區縣招聘工資情況_{}(K).jpg'.format(df_city1)) # plt.show() plt.close() # ---------------------------------------------------- skill_all = data_df_python_keyword['技能'] print(skill_all) skill_list = [] for i in skill_all: # print(type(i)) print(i) # print(i.split(", | ' | \[ | \] | \" | ")) result = re.split(r'[,\' \[, \] ]', i) print(result) # if type(i) == list: skill_list = skill_list + result print('++++++++++++++++++++++++++++++++') # print(skill_list) list_new = skill_list # 詞頻統計 word_counts = collections.Counter(list_new) # 對分詞做詞頻統計 word_counts_top10 = word_counts.most_common(30) # 獲取前10最高頻的詞 # print (word_counts_top10) # 輸出檢查 # print (word_counts_top10[0][0]) # 輸出檢查 # 生成柱狀圖 list_x = [] list_y = [] for i in word_counts_top10: list_x.append(i[0]) list_y.append(i[1]) print('list_x', list_x[1:]) print('list_y', list_y[1:]) plt.figure(figsize=(30, 5)) plt.bar(list_x[1:], list_y[1:]) plt.savefig('./image/技能棧_詞頻_柱狀圖.png') # plt.show() plt.close() list_new = " ".join(list_new) # 列表轉字符串,以空格間隔 # print(list_new) wc = wordcloud.WordCloud( width=800, height=600, background_color="#ffffff", # 設置背景顏色 max_words=50, # 詞的最大數(默認為200) max_font_size=60, # 最大字體尺寸 min_font_size=10, # 最小字體尺寸(默認為4) # colormap='bone', # string or matplotlib colormap, default="viridis" colormap='hsv', # string or matplotlib colormap, default="viridis" random_state=20, # 設置有多少種隨機生成狀態,即有多少種配色方案 # mask=plt.imread("mask2.gif"), # 讀取遮罩圖片!! font_path='simhei.ttf' ) my_wordcloud = wc.generate(list_new) plt.imshow(my_wordcloud) plt.axis("off") # plt.show() wc.to_file('./image/技能棧_詞雲.png') # 保存圖片文件 plt.close()
四、學歷分析
五、工作經驗分析
六、14個熱門城市的各區縣招聘薪資情況
七、各城市各區縣的薪資情況
北京
上海
其餘12個城市不再展示,生成代碼都一樣
八、技能棧
到此這篇關於Python數據分析之Python和Selenium爬取BOSS直聘崗位的文章就介紹到這瞭,更多相關Python和Selenium爬取BOSS直聘內容請搜索WalkonNet以前的文章或繼續瀏覽下面的相關文章希望大傢以後多多支持WalkonNet!
推薦閱讀:
- 利用Python制作一個簡單的天氣播報系統
- python+selenium對table表和分頁處理
- 使用微信小程序制作核酸檢測點查詢工具
- 基於Python實現火車票搶票軟件
- python爬蟲利用selenium實現自動翻頁爬取某魚數據的思路詳解