Scraping 1zhen creator rankings with Scrapy

spider

# -*- coding: utf-8 -*-
import scrapy
import json

from ..items import YizhenItem


class A1zhenSpider(scrapy.Spider):
    name = '1zhen'
    allowed_domains = ['www.1zhen.com']
    # Ranking periods to crawl: weeks 37-44 of 2017 and months 2017-09/2017-10.
    types = ['week', 'month']
    pt_weeks = [str(i) for i in range(201737, 201745)]
    pt_months = [str(i) for i in range(201709, 201711)]
    platforms = ['all', 'tencent', 'sohu', 'iqiyi', 'youku',
                 'toutiao', 'meipai', 'miaopai', 'bilibili']
    categories = [str(i) for i in range(1, 20)]
    # Map the API's numeric category ids to their display names.
    categories_mapping = {
        '1': '全部',
        '2': '汽车',
        '3': '动漫',
        '4': '亲子',
        '5': '财经',
        '6': '娱乐',
        '7': '时尚',
        '8': '美食',
        '9': '搞笑',
        '10': '游戏',
        '11': '健康',
        '12': '资讯',
        '13': '知识',
        '14': '生活',
        '15': '音乐',
        '16': '萌宠',
        '17': '运动',
        '18': '科技',
        '19': '旅游'
    }

    def start_requests(self):
        # Enumerate every (platform, category, period) combination and hit
        # the ranking API once per combination.
        for platform in self.platforms:
            for category in self.categories:
                for rank_type in self.types:
                    meta = {
                        'type': rank_type,
                        'platform': platform,
                        'category': category
                    }
                    if rank_type == 'week':
                        for pt_week in self.pt_weeks:
                            # scrapy.Request copies meta, so reusing one dict is safe.
                            meta['pt'] = pt_week
                            url = 'http://www.1zhen.com/api/v2/rank/ip?type=week&pt_week={}&platform={}&category={}'.format(
                                pt_week, platform, category)
                            yield scrapy.Request(url, meta=meta)
                    elif rank_type == 'month':
                        for pt_month in self.pt_months:
                            meta['pt'] = pt_month
                            url = 'http://www.1zhen.com/api/v2/rank/ip?type=month&pt_month={}&platform={}&category={}'.format(
                                pt_month, platform, category)
                            yield scrapy.Request(url, meta=meta)
        # Single-request variant, kept for debugging:
        # url = 'http://www.1zhen.com/api/v2/rank/ip?type=week&pt_week=201737&platform=all&category=1'
        # meta = {
        #     'type': 'week',
        #     'platform': 'all',
        #     'category': '1',
        #     'pt': '201737'
        # }
        # yield scrapy.Request(url, meta=meta)

    def parse(self, response):
        result = json.loads(response.text)
        errcode = result['code']
        if errcode != 0:
            # A non-zero code means the API has no data for this combination.
            return
        data = result['data']
        meta = response.meta
        item = YizhenItem({
            'type': meta['type'],
            'platform': meta['platform'],
            # Store the readable category name instead of the numeric id.
            'category': self.categories_mapping[meta['category']],
            'pt': meta['pt']
        })
        item['ranks'] = data['ranks']
        yield item
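
The spider is run by its name. The response shape below is not documented anywhere in this post; it is inferred purely from the fields that parse() touches, so treat it as a sketch:

# Run from the project root:
#   scrapy crawl 1zhen
#
# JSON body that parse() relies on (inferred from the code above):
# {
#     "code": 0,            # non-zero -> the response is skipped
#     "data": {
#         "ranks": [...]    # stored verbatim in item['ranks']
#     }
# }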

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class YizhenItem(scrapy.Item):
    type = scrapy.Field()      # ranking period type: 'week' or 'month'
    pt = scrapy.Field()        # period tag, e.g. '201737' or '201709'
    platform = scrapy.Field()  # video platform, e.g. 'bilibili'
    category = scrapy.Field()  # human-readable category name
    ranks = scrapy.Field()     # raw ranking list from the API response

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo


class MongoPipeline(object):
    collection_name = 'yizhen_rank'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the MongoDB connection settings from settings.py.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'yizhen')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # One document per (type, pt, platform, category) ranking snapshot.
        self.db[self.collection_name].insert_one(dict(item))
        return item
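
For the pipeline to run, settings.py must register it and provide the Mongo connection values that from_crawler reads. A minimal sketch, assuming the project module is named yizhen (the actual module name may differ):

# settings.py -- minimal sketch; 'yizhen' is an assumed project module name
ITEM_PIPELINES = {
    'yizhen.pipelines.MongoPipeline': 300,
}
MONGO_URI = 'mongodb://localhost:27017'  # adjust to your MongoDB instance
MONGO_DATABASE = 'yizhen'                # optional; defaults to 'yizhen'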