很多小伙伴在求职的时候没有办法在短时间内看完大量的职位信息数据,可能就会因此错过一些好的岗位。今天小编带来一个关于招聘岗位数据爬取的python爬虫实战项目(附带数据分析),那么接下来就让我们来看看python怎么爬取招聘岗位数据吧。
本篇文章URL已作了特别处理,所以代码不能直接使用。爬虫学习的是思路而不是代码的复制,希望小伙伴们能自行根据思路写出自己的爬虫代码!
另外:恶意爬取别人的网站数据是违法行为,在学习的过程中请注意爬取力度。
一、数据爬取的代码
#encoding='utf-8'
from selenium import webdriver
import time
import re
import pandas as pd
import os
def close_windows():
    """Dismiss the login pop-up if one is currently shown.

    Works on the module-level Selenium driver ``dr`` (assigned in ``main``).
    A missing pop-up is the normal case, so a failed lookup or click is only
    reported, never raised.
    """
    time.sleep(0.5)  # short pause before probing for the pop-up
    try:
        # Look the close icon up once; a failed lookup raises, so no extra
        # truthiness check is needed before clicking.
        close_icon = dr.find_element_by_class_name("jconfirm").find_element_by_class_name("closeIcon")
        close_icon.click()
    except Exception as e:
        # Exception rather than BaseException, so Ctrl-C / SystemExit can
        # still stop the crawl.
        print('close_windows,沒(méi)有彈窗', e)
def get_current_region_job(k_index):
    """Scrape every result page of the listing currently open and append it to the CSV.

    Reads each ``job-primary`` card shown by the module-level driver ``dr``,
    then clicks "next" until the highlighted page number stops changing or
    the click fails. The collected rows are appended to ``數(shù)據(jù).csv``
    (gb18030); the header row is written only when the file does not exist yet.

    Args:
        k_index: Number of records collected before this call; also used as
            the first row label of the rows gathered here.

    Returns:
        The record count after this call.
    """
    # NOTE(review): the Chinese literals below appear to carry extraction
    # artifacts (pinyin in parentheses). They are kept byte-for-byte because
    # the analysis script reads the same file name and column names.
    columns = ['崗位', '地點(diǎn)', '薪資', '工作經(jīng)驗(yàn)', '學(xué)歷', '公司', '技能']
    csv_path = "數(shù)據(jù).csv"
    # The "job-limit" paragraph looks like: 1-3年<em class="vline"></em>本科
    # Split on the <em> separator itself instead of matching any 23
    # characters from a character class and round-tripping through a comma.
    separator = re.compile(r'<em\b[^>]*>\s*</em>')

    first_index = k_index
    rows = []
    while True:
        time.sleep(0.5)
        close_windows()
        for job in dr.find_elements_by_class_name("job-primary"):  # cards on the current page
            job_name = job.find_element_by_class_name("job-name").text
            job_area = job.find_element_by_class_name("job-area").text
            salary = job.find_element_by_class_name("red").get_attribute("textContent")
            experience_education = job.find_element_by_class_name("job-limit").find_element_by_tag_name(
                "p").get_attribute("innerHTML")
            experience_education_list = separator.split(experience_education)
            if len(experience_education_list) != 2:
                print('experience_education_list不是2個(gè),跳過(guò)該數(shù)據(jù)', experience_education_list)
                # Skip only this card; the original `break` silently dropped
                # the rest of the page as well.
                continue
            experience, education = experience_education_list
            company = job.find_element_by_class_name("company-text").find_element_by_class_name("name").text
            tag_texts = (
                tag.text
                for tag in job.find_element_by_class_name("tags").find_elements_by_class_name("tag-item")
            )
            skill = [text for text in tag_texts if text]  # ignore empty tags
            rows.append([job_name, job_area, salary, experience, education, company, skill])
            k_index += 1
            print("已經(jīng)讀取數(shù)據(jù){}條".format(k_index))

        close_windows()
        try:
            pager = dr.find_element_by_class_name("page")
            cur_page_num = pager.find_element_by_class_name("cur").text
            # Script click, as in the rest of the file, to avoid pop-up interference.
            dr.execute_script("arguments[0].click();", pager.find_element_by_class_name("next"))
            time.sleep(1)
            new_page_num = dr.find_element_by_class_name("page").find_element_by_class_name("cur").text
        except Exception as e:
            print('點(diǎn)擊下一頁(yè)錯(cuò)誤', e)
            break
        if cur_page_num == new_page_num:
            # The page number did not advance: this was the last page.
            break

    # Build the frame once instead of growing it row by row with .loc.
    df_empty = pd.DataFrame(rows, columns=columns)
    df_empty.index = range(first_index, k_index)
    print(df_empty)
    # Append mode creates the file when missing; emit the header only then.
    write_header = not os.path.exists(csv_path)
    df_empty.to_csv(csv_path, mode='a', header=write_header, index=False, encoding='gb18030')
    return k_index
def _city_links():
    """Return the <a> elements inside the "condition-city" filter bar."""
    return dr.find_element_by_class_name("condition-city").find_elements_by_tag_name("a")


def _district_links():
    """Return the <a> elements inside the "condition-district" filter bar."""
    return dr.find_element_by_class_name("condition-district").find_elements_by_tag_name("a")


def _js_click(element):
    """Click *element* through JavaScript (the file does this to sidestep pop-ups)."""
    dr.execute_script("arguments[0].click();", element)


def main():
    """Walk the city and district filters of the job site and scrape each district.

    Opens Chrome, loads the search page, then for city-filter links 3..16
    (printed as cities 1..14) clicks through up to 49 district links per city,
    calling ``get_current_region_job`` for each one. The browser is always
    closed, even when a step raises.

    NOTE(review): the ``find_element_by_*`` helpers used throughout this
    script were removed in Selenium 4.3 — confirm the installed version.
    """
    global dr
    dr = webdriver.Chrome()
    try:
        dr.maximize_window()
        dr.get("https://www.******.com/c101010100/?query=Python&ka=sel-city-101010100")  # Beijing
        print("打開(kāi)網(wǎng)址")
        time.sleep(5)
        k_index = 0  # number of records so far; also the DataFrame row label
        for i in range(3, 17):
            # Select the city.
            close_windows()
            hot_city_list = _city_links()
            close_windows()
            _js_click(hot_city_list[i])
            # Print the city name (links are re-read after the click).
            close_windows()
            print('城市:{}'.format(i - 2), _city_links()[i].text)
            time.sleep(0.5)
            for j in range(1, 50):
                # Click the city again on every pass; per the original
                # comment, the updated page is not recognised otherwise.
                close_windows()
                hot_city_list = _city_links()
                close_windows()
                _js_click(hot_city_list[i])
                close_windows()
                city_district = _district_links()
                if len(city_district) == j:
                    print('遍歷完所有區(qū)縣,沒(méi)有不可點(diǎn)擊的,跳轉(zhuǎn)下一個(gè)城市')
                    break
                print('區(qū)縣:', j, city_district[j].text)
                # Select the district (links are re-read before clicking).
                close_windows()
                city_district = _district_links()
                close_windows()
                _js_click(city_district[j])
                # After the click, link 1 of the city bar shows the district;
                # if it shows '不限' instead, every district has been visited.
                close_windows()
                second_link_text = _city_links()[1].text
                print('點(diǎn)擊后這里應(yīng)該是區(qū)縣', second_link_text)
                print('如果點(diǎn)完了,這里應(yīng)該是不限:', second_link_text)
                if second_link_text == '不限':
                    print('當(dāng)前區(qū)縣已經(jīng)點(diǎn)完了,點(diǎn)擊下一個(gè)城市')
                    break
                close_windows()
                k_index = get_current_region_job(k_index)  # scrape this district
                # Go back to the city page. The bar now holds an extra
                # (district) entry, so the city sits at i + 1.
                close_windows()
                hot_city_list = _city_links()
                close_windows()
                _js_click(hot_city_list[i + 1])
                time.sleep(0.5)
            time.sleep(0.5)
    finally:
        # Always release the browser; previously any exception left it open.
        dr.quit()


if __name__ == '__main__':
    main()
二、获取到的数据如图所示
三、数据分析的代码
# coding=utf-8
import collections
import wordcloud
import re
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
# Matplotlib text setup: SimHei so Chinese labels render, and a minus sign
# that displays correctly with that font.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
def create_dir_not_exist(path):
    """Create directory *path* (and any missing parents) unless it already exists.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If *path* exists but is not a directory.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # `if not exists: mkdir` and also works when the parent is missing.
    os.makedirs(path, exist_ok=True)
# Output folders for the charts.
create_dir_not_exist(r'./image')
create_dir_not_exist(r'./image/city')

# NOTE(review): the Chinese file and column names appear to carry extraction
# artifacts (pinyin in parentheses). They are kept byte-for-byte because they
# must match what the scraper writes; change both scripts together.
data = pd.read_csv('數(shù)據(jù).csv', encoding='gb18030')
data_df = pd.DataFrame(data)
# The "\n" escapes had been turned into real line breaks inside the string
# literals, which is a syntax error; they are restored here.
print("\n查看是否有缺失值\n", data_df.isnull().sum())

# Drop rows with a missing job title, then rows with a missing company.
data_df_del_empty = data_df.dropna(subset=['崗位'], axis=0)
data_df_del_empty = data_df_del_empty.dropna(subset=['公司'], axis=0)
print("\n查看是否有缺失值\n", data_df_del_empty.isnull().sum())
print('去除缺失值后\n', data_df_del_empty)

# Keep only postings whose title mentions Python. Copy the slice so the
# insert() calls below act on an independent frame.
data_df_python_keyword = data_df_del_empty.loc[
    data_df_del_empty['崗位'].str.contains('Python|python')
].copy()

# Lower bound of the salary range: the text before the first '-'.
data_df_python_keyword_salary = data_df_python_keyword['薪資'].str.split('-', expand=True)[0]
print(data_df_python_keyword_salary)
data_df_python_keyword.insert(7, '區(qū)間最小薪資(K)', data_df_python_keyword_salary)
print(data_df_python_keyword)

# Location is "<city>·<district>[·...]". reindex guarantees that both
# columns exist even when no row contains a '·'.
location_parts = data_df_python_keyword['地點(diǎn)'].str.split('·', expand=True).reindex(columns=[0, 1])
data_df_python_keyword_location_city = location_parts[0]
print(data_df_python_keyword_location_city)  # e.g. the city part
data_df_python_keyword_location_district = location_parts[1]
print(data_df_python_keyword_location_district)  # e.g. the district part

# City and district concatenated. A missing district counts as empty text
# instead of raising TypeError on `str + None`.
data_df_python_keyword_location_city_district = (
    data_df_python_keyword_location_city + data_df_python_keyword_location_district.fillna('')
).tolist()
print(data_df_python_keyword_location_city_district)

data_df_python_keyword.insert(8, '城市地區(qū)', data_df_python_keyword_location_city_district)
print(data_df_python_keyword)
data_df_python_keyword.insert(9, '城市', data_df_python_keyword_location_city)
data_df_python_keyword.insert(10, '地區(qū)', data_df_python_keyword_location_district)
data_df_python_keyword.to_csv("data_df_python_keyword.csv", index=False, encoding='gb18030')
print('-------------------------------------------')
def draw_bar(row_lable, title):
    """Save a bar chart of the number of rows per distinct value of one column.

    Groups the module-level ``data_df_python_keyword`` by *row_lable*, orders
    the bars by ascending count (ties by label) and writes the figure to
    ``./image/<title>分析.jpg``.

    Args:
        row_lable: Column name (or other ``groupby`` key) to count by.
        title: Chart title; also used in the output file name.
    """
    # Local (count, label) pairs. The original declared its work lists and
    # loop variables `global`, and its zip(*sorted(...)) unpack raised
    # ValueError when the frame had no groups.
    pairs = sorted(
        (len(group_rows), group_key)
        for group_key, group_rows in data_df_python_keyword.groupby(row_lable)
    )
    counts = [count for count, _ in pairs]
    labels = [label for _, label in pairs]

    plt.figure(figsize=(10, 6))
    plt.bar(labels, counts)
    plt.title('{}'.format(title))
    plt.savefig('./image/{}分析.jpg'.format(title))
    plt.close()
# One count chart per column, as (column to group by, chart title) pairs.
for column_name, chart_title in (
    ('學(xué)歷', '學(xué)歷'),
    ('工作經(jīng)驗(yàn)', '工作經(jīng)驗(yàn)'),
    ('區(qū)間最小薪資(K)', '14個(gè)熱門(mén)城市的薪資分布情況(K)'),
):
    draw_bar(column_name, chart_title)
# -----------------------------------------
# Mean of the minimum salary for every city+district, as one horizontal bar chart.
district_means = []
for district_name, district_rows in data_df_python_keyword.groupby('城市地區(qū)'):
    minimum_salaries = [int(value) for value in district_rows['區(qū)間最小薪資(K)'].values.tolist()]
    # Mean per district, rounded to 2 decimals.
    district_means.append((round(np.mean(minimum_salaries), 2), district_name))
# Ascending by mean (ties by name). Sorting pairs in place also copes with an
# empty frame, where the former zip(*sorted(...)) unpack raised ValueError.
district_means.sort()
list_group_city2 = [mean for mean, _ in district_means]
list_group_city1 = [name for _, name in district_means]

plt.figure(figsize=(10, 50))
plt.barh(list_group_city1, list_group_city2)
for bar_label, bar_value in zip(list_group_city1, list_group_city2):
    # plt.text(x, y, text): for horizontal bars x is the value and y the
    # category. ha is the horizontal alignment, va the vertical one.
    plt.text(bar_value, bar_label, '%.2f' % bar_value, ha='center', va='bottom')
plt.title('14個(gè)熱門(mén)城市的各區(qū)縣招聘工資情況(K)')
plt.savefig('./image/14個(gè)熱門(mén)城市的各區(qū)縣招聘工資情況(K).jpg')
plt.close()
# -----------------------------------------
# One chart per city: mean minimum salary of each district, highest first.
for city_name, city_rows in data_df_python_keyword.groupby('城市'):
    city_district_means = []
    # Group the city's own rows by their district column. The original
    # passed a Series taken from the whole frame as the grouping key.
    for district_name, district_rows in city_rows.groupby('地區(qū)'):
        minimum_salaries = [int(value) for value in district_rows['區(qū)間最小薪資(K)'].values.tolist()]
        # Mean per district, rounded to 2 decimals.
        city_district_means.append((round(np.mean(minimum_salaries), 2), district_name))
    # Descending by mean. Also safe when a city has no district rows, where
    # the former zip(*sorted(...)) unpack raised ValueError.
    city_district_means.sort(reverse=True)
    district_mean_salary2 = [mean for mean, _ in city_district_means]
    list_group_district2 = [name for _, name in city_district_means]

    plt.figure(figsize=(10, 6))
    plt.bar(list_group_district2, district_mean_salary2)
    for bar_label, bar_value in zip(list_group_district2, district_mean_salary2):
        # Value label centred horizontally (ha) on the bar, sitting on its top (va).
        plt.text(bar_label, bar_value, '%.2f' % bar_value, ha='center', va='bottom')
    plt.title('14個(gè)熱門(mén)城市的各區(qū)縣招聘工資情況_{}(K)'.format(city_name))
    plt.savefig('./image/city/14個(gè)熱門(mén)城市的各區(qū)縣招聘工資情況_{}(K).jpg'.format(city_name))
    plt.close()
# ----------------------------------------------------
# Skill tags: frequency bar chart and word cloud.
def _parse_skill_cell(cell):
    """Split one skill cell (the text form of a list, e.g. "['a', 'b']") into names."""
    if not isinstance(cell, str):  # e.g. NaN when the cell is empty
        return []
    # Split on list punctuation only, so multi-word skills stay intact, then
    # drop the empty fragments left between delimiters.
    fragments = (fragment.strip() for fragment in re.split(r'''[\[\]'",]''', cell))
    return [fragment for fragment in fragments if fragment]


skill_all = data_df_python_keyword['技能']
print(skill_all)
skill_list = []
for cell in skill_all:
    skill_list.extend(_parse_skill_cell(cell))
print('++++++++++++++++++++++++++++++++')
list_new = skill_list

# Frequency count. The name says "top10" but it holds the 30 most common
# skills; the name is kept for compatibility.
word_counts = collections.Counter(list_new)
word_counts_top10 = word_counts.most_common(30)

# Bar chart. Empty fragments are filtered during parsing, so the first entry
# no longer has to be skipped with [1:].
list_x = [word for word, _ in word_counts_top10]
list_y = [count for _, count in word_counts_top10]
print('list_x', list_x)
print('list_y', list_y)
plt.figure(figsize=(30, 5))
plt.bar(list_x, list_y)
plt.savefig('./image/技能棧_詞頻_柱狀圖.png')
plt.close()

wc = wordcloud.WordCloud(
    width=800,
    height=600,
    background_color="#ffffff",  # background colour
    max_words=50,  # maximum number of words drawn
    max_font_size=60,
    min_font_size=10,
    colormap='hsv',  # matplotlib colormap name
    random_state=20,  # fixed seed, for a reproducible result
    font_path='simhei.ttf',
)
# Feed the counts directly instead of re-tokenising a space-joined string,
# which would split multi-word skills. Raises ValueError when there are no
# skills at all.
my_wordcloud = wc.generate_from_frequencies(word_counts)
plt.imshow(my_wordcloud)
plt.axis("off")
wc.to_file('./image/技能棧_詞云.png')  # save the image
plt.close()
四、学历分析
五、工作经验分析
六、14个热门城市的各区县招聘薪资情况
七、各城市各区县的薪资情况
北京
上海
其余12个城市不再展示,生成代码都一样
八、技能栈
小结
到此这篇python怎么爬取招聘岗位数据的文章就介绍到这了,更多Python爬虫实战内容请搜索W3Cschool以前的文章或继续浏览下面的相关文章。