1/1
豆瓣top100電影信息爬蟲
用 Python 爬取豆瓣電影 TOP100 的簡易信息
要收集的信息包括:每部電影的標題、導演、上映年份、評分。
先分析一下網頁結構,然後用 xpath 解析出想要的數據,接著保存到 mongodb 數據庫中:
import requests
from lxml import etree
from chardet import detect
from pymongo import MongoClient
def douban_spider(total=100, page_size=25):
    """
    Douban crawler scheduler.

    Requests the Douban list page by page, decodes each response body and
    passes the parsed HTML tree to ``page_parser``.

    :param total: number of list entries to crawl (default 100).
    :param page_size: step of the ``start`` query parameter between two
        consecutive pages (default 25).
    :return: None
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # With the defaults this yields start = 0, 25, 50, 75: four pages.
    # The end bound is exclusive, so no extra page at start=100 is requested.
    for start in range(0, total, page_size):
        # Fetch the HTML document with requests.
        url = 'https://www.douban.com/doulist/13704241/?start=' + str(start)
        print(url)
        # A timeout keeps a stalled connection from blocking the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            # Skip pages that did not load instead of parsing an error body.
            print('skip %s: HTTP %s' % (url, response.status_code))
            continue
        raw = response.content
        # chardet may report no encoding at all; fall back to UTF-8 then.
        encoding = detect(raw).get('encoding') or 'utf-8'
        html = raw.decode(encoding, errors='replace')
        # Parse the HTML document so it can be queried with xpath.
        tree = etree.HTML(html)
        if tree is None:
            # Nothing parseable came back (e.g. an empty body).
            continue
        page_parser(tree)
def _abstract_value(line):
    """Return the stripped text after the last ':' of an abstract line."""
    return line.strip().split(':')[-1].strip()


def page_parser(tree):
    """
    Page parser.

    Walks every ``doulist-item`` element of the page, collects ranking,
    title, director, year and rating into a dict and passes that dict to
    ``save_data``. A field whose node is missing is left out of the dict
    instead of aborting the whole page.

    :param tree: parsed HTML tree exposing ``xpath``.
    :return: None
    """
    for item in tree.xpath('//div[@class="article"]/div[@class="doulist-item"]'):
        data = {}
        # Ranking: only stored when the node exists.
        ranking = item.xpath('.//div[@class="hd"]/span/text()')
        if ranking:
            data['ranking'] = ranking[0]
        # Title
        data['title'] = ''.join(item.xpath('.//div[@class="bd doulist-subject"]/div[@class="title"]/a/text()')).strip()
        abstract = item.xpath('.//div[@class="abstract"]/text()')
        if abstract:
            # Director: first text line of the abstract.
            data['director'] = _abstract_value(abstract[0])
            # Release year: fifth text line when present, otherwise the
            # fourth; omitted when the abstract is shorter than that.
            if len(abstract) > 4:
                data['year'] = _abstract_value(abstract[4])
            elif len(abstract) > 3:
                data['year'] = _abstract_value(abstract[3])
        # Rating
        rating_num = item.xpath('.//span[@class="rating_nums"]/text()')
        if rating_num:
            data['rating_num'] = rating_num[0]
        save_data(data)
def save_data(data):
    """
    Write one scraped record to the MongoDB database.

    The record is inserted into collection ``top100`` of database
    ``douban`` on the default ``MongoClient()`` connection. Errors raised
    by the driver are not caught here and propagate to the caller.

    :param data: dict holding the fields of one list entry.
    :return: bool, whether the server acknowledged the insert
    """
    # Using the client as a context manager closes it on exit, so a
    # connection is not left open for every saved record.
    with MongoClient() as client:
        db = client.douban
        collection = db['top100']
        result = collection.insert_one(data)
    return result.acknowledged
def main():
    """Script entry point: start the Douban crawler."""
    douban_spider()


# Run the crawler only when the file is executed as a script.
if __name__ == '__main__':
    main()
最後輸出到 douban_movie.csv 裡,打開後是這樣的~