Crawlers for several travel websites

All of the spiders below are built on the Scrapy framework. The andaman project spiders (Mafengwo, pintour, ctrip) are written for Python 2, while the later xiecheng spider and the parse_html script target Python 3.

Mafengwo

# coding=utf-8
import json
from urlparse import urljoin
import re
import logging

import scrapy
from scrapy.http import Request
from scrapy.selector import Selector

from andaman.utils.html import html2text, parse_time
from andaman.items.qa import QAItem
from andaman.items.jieban import JiebanItem

__author__ = 'zephyre'


class MafengwoQaSpider(scrapy.Spider):
    name = 'mafengwo-qa'

    def parse(self, response):
        html_text = json.loads(response.body)['payload']['list_html']
        for href in Selector(text=html_text).xpath(
                '//li/div[@class="wen"]//div[@class="title"]/a[@href]/@href').extract():
            url = urljoin(response.url, href)
            yield Request(url=url, callback=self.parse_question)

    def start_requests(self):
        for start_idx in xrange(0, 500, 20):
            yield Request(url='http://www.mafengwo.cn/qa/ajax_pager.php?action=question_index&start=%d' % start_idx)

    def parse_question(self, response):
        # Crawl related questions
        for related_href in response.selector.xpath(
                '//div[@class="q-relate"]/ul[@class="bd"]/li/a[@href]/@href').extract():
            url = urljoin(response.url, related_href)
            yield Request(url=url, callback=self.parse_question)

        q_item = self.retrive_question(response)
        yield q_item

        # Crawl the answers
        qid = q_item['qid']
        page = 0
        page_size = 50
        url = 'http://www.mafengwo.cn/qa/ajax_pager.php?qid=%d&action=question_detail&start=%d' \
              % (qid, page * page_size)
        yield Request(url=url, callback=self.parse_answer_list,
                      meta={'qid': qid, 'page': page, 'page_size': page_size})

    def retrive_question(self, response):
        """
        Parse the response and build the question item.
        """
        tmp = response.selector.xpath('//div[@class="q-detail"]/div[@class="person"]/div[@class="avatar"]/a[@href]')
        try:
            user_href = tmp[0].xpath('./@href').extract()[0]
        except IndexError:
            self.logger.warning('Invalid response: %s' % response.url)
            self.logger.warning(response.body)
            raise
        m = re.search(r'/wenda/u/(\d+)', user_href)
        author_id = int(m.group(1))

        tmp = tmp[0].xpath('./img/@src').extract()[0]
        author_avatar = re.sub(r'\.head\.w\d+\.', '.', tmp)
        if author_avatar.endswith('pp48.gif'):
            author_avatar = None

        author_name = response.selector.xpath(
            '//div[@class="q-content"]/div[@class="user-bar"]/a[@class="name"]/text()').extract()[0]
        title = response.selector.xpath('//div[@class="q-content"]/div[@class="q-title"]/h1/text()').extract()[0]
        raw_contents = \
            response.selector.xpath('//div[@class="q-content"]/div[@class="q-info"]/div[@class="q-desc"]').extract()[0]
        contents = html2text(raw_contents)

        tmp = response.selector.xpath(
            '//div[@class="q-content"]/div[@class="user-bar"]//span[@class="visit"]/text()').extract()[0]
        view_cnt = int(re.search(ur'(\d+)\s*浏览', tmp).group(1))

        time_str = response.selector.xpath(
            '//div[@class="q-content"]/div[@class="user-bar"]//span[@class="time"]/text()').extract()[0]
        timestamp = parse_time(time_str)

        tmp = response.selector.xpath(
            '//div[@class="q-content"]/div[@class="user-bar"]/span[@class="fr"]/a[@href]/text()').extract()
        if tmp and tmp[0].strip():
            topic = tmp[0].strip()
        else:
            topic = None

        raw_tags = response.selector.xpath(
            '//div[@class="q-content"]/div[@class="q-info"]/div[@class="q-tags"]/a[@class="a-tag"]/text()').extract()
        tags = [tmp.strip() for tmp in raw_tags if tmp.strip()]

        match = re.search(r'detail-(\d+)\.html', response.url)
        qid = int(match.group(1))

        item = QAItem()
        item['source'] = 'mafengwo'
        item['type'] = 'question'
        item['qid'] = qid
        item['title'] = title
        item['author_nickname'] = author_name
        item['author_id'] = author_id
        if author_avatar:
            item['author_avatar'] = author_avatar
            item['file_urls'] = [author_avatar]
        item['timestamp'] = timestamp
        if topic:
            item['topic'] = topic
        item['contents'] = contents
        item['tags'] = tags
        item['view_cnt'] = view_cnt

        return item

    def parse_answer_list(self, response):
        meta = response.meta
        qid = meta['qid']
        page = meta['page']
        page_size = meta['page_size']
        sel = Selector(text=json.loads(response.body)['payload']['list_html'])
        answer_nodes = sel.xpath('//li[contains(@class, "answer-item")]')
        if not answer_nodes:
            return

        # Look for the next page
        if len(answer_nodes) == page_size:
            next_page = page + 1
            url = 'http://www.mafengwo.cn/qa/ajax_pager.php?qid=%d&action=question_detail&start=%d' \
                  % (qid, next_page * page_size)
            yield Request(url=url, callback=self.parse_answer_list,
                          meta={'qid': qid, 'page': next_page, 'page_size': page_size})

        for answer_node in sel.xpath('//li[contains(@class, "answer-item") and @data-aid]'):
            aid = int(answer_node.xpath('./@data-aid').extract()[0])

            author_node = answer_node.xpath('./div[@class="person"]/div[contains(@class, "avatar") and @data-uid]')[0]
            author_id = int(author_node.xpath('./@data-uid').extract()[0])
            tmp = author_node.xpath('./a/img/@src').extract()[0]
            author_avatar = re.sub(r'\.head\.w\d+\.', '.', tmp)
            if author_avatar.endswith('pp48.gif'):
                author_avatar = None

            content_node = answer_node.xpath('./div[contains(@class,"answer-content")]')[0]
            author_name = content_node.xpath('./div[@class="user-bar"]/a[@class="name"]/text()').extract()[0]
            time_str = content_node.xpath('./div[@class="user-bar"]//span[@class="time"]/text()').extract()[0]
            timestamp = parse_time(time_str)
            accepted = bool(answer_node.xpath('.//div[contains(@class,"answer-best")]'))
            raw_contents = content_node.xpath('.//dl/dd[@class="_j_answer_html"]').extract()[0]
            contents = html2text(raw_contents)

            try:
                vote_cnt = int(answer_node.xpath('.//a[@class="btn-zan"]/span/text()').extract()[0])
            except (IndexError, ValueError):
                self.logger.debug(u'Invalid vote count: %s' % answer_node.extract())
                vote_cnt = 0

            item = QAItem()
            item['type'] = 'answer'
            item['source'] = 'mafengwo'
            item['qid'] = qid
            item['aid'] = aid
            item['author_nickname'] = author_name
            item['author_id'] = author_id
            if author_avatar:
                item['author_avatar'] = author_avatar
                item['file_urls'] = [author_avatar]
            item['timestamp'] = timestamp
            item['contents'] = contents
            item['vote_cnt'] = vote_cnt
            item['accepted'] = accepted

            yield item


class MafengwoSpider(scrapy.Spider):
    name = "mafengwo-jieban"
    allowed_domains = ["mafengwo.cn"]

    def start_requests(self):
        total_page = self.crawler.settings.getint('MAFENGWO_JIEBAN_PAGES', 10)
        session_id = self.crawler.settings.get('MAFENGWO_SESSION_ID')
        cookies = {'PHPSESSID': session_id} if session_id else {}
        for i in range(total_page):
            url = 'http://www.mafengwo.cn/together/ajax.php?act=getTogetherMore&flag=3&offset=%d&mddid=0&timeFlag=1' \
                  '&timestart=' % i
            yield scrapy.Request(url, cookies=cookies)

    def parse(self, response):
        hrefs = scrapy.Selector(text=json.loads(response.body)['data']['html']).xpath('//li/a/@href').extract()
        for href in hrefs:
            url = 'http://www.mafengwo.cn/together/' + href
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        tid = int(str(response.xpath('//script[1]/text()').re(r'"tid":\d+')[0])[6:])
        url = 'http://www.mafengwo.cn/together/ajax.php?act=moreComment&page=%d&tid=%d' % (0, tid)
        total = int(str(response.xpath('//script[1]/text()').re(r'"total":\d+')[0][8:])) / 10 + 1
        summary = response.xpath('//div[@class="summary"]')

        item = JiebanItem()
        item['source'] = 'mafengwo'
        item['title'] = response.xpath('//title/text()').extract()[0]
        # The byte slices below strip the leading field labels from the UTF-8 encoded summary text
        item['start_time'] = summary.xpath('//div[@class="summary"]/ul/li[1]/span/text()').extract()[0].encode(
            "UTF-8")[15:]
        item['days'] = summary.xpath('//div[@class="summary"]/ul/li[2]/span/text()').extract()[0].encode("UTF-8")[9:]
        item['destination'] = summary.xpath('//div[@class="summary"]/ul/li[3]/span/text()').extract()[0].encode(
            "UTF-8")[12:].split("/")
        item['departure'] = summary.xpath('//div[@class="summary"]/ul/li[4]/span/text()').extract()[0].encode(
            "UTF-8")[12:]
        item['people'] = summary.xpath('//div[@class="summary"]/ul/li[5]/span/text()').extract()[0].encode("UTF-8")[15:]
        item['description'] = '\n'.join(filter(lambda v: v, [tmp.strip() for tmp in summary.xpath(
            '//div[@class="desc _j_description"]/text()').extract()])).encode("UTF-8")
        item['author_avatar'] = summary.xpath('//div[@class="sponsor clearfix"]/a/img/@src').extract()[0].encode(
            "UTF-8")
        item['comments'] = []
        item['tid'] = tid

        yield scrapy.Request(url, meta={'item': item, 'page': 0, 'total': total, 'tid': tid},
                             callback=self.parse_comments)

    def parse_comments(self, response):
        item = response.meta['item']
        page = response.meta['page'] + 1
        body = scrapy.Selector(text=json.loads(response.body)['data']['html'])
        if body.extract() != '<html></html>':
            for node in body.xpath('//div[@class="vc_comment"]'):
                try:
                    author_avatar = node.xpath('.//div[@class= "avatar"]/a/img/@src').extract()[0].encode("UTF-8")
                    author = node.xpath('.//a[@class="comm_name"]/text()').extract()[0].encode("UTF-8")
                    cid = int(node.xpath('.//div[@class="comm_reply"]/a/@data-cid').extract()[0].encode("UTF-8"))
                    comment = '\n'.join(
                        filter(lambda v: v, [tmp.strip() for tmp in node.xpath('.//p/text()').extract()])).encode(
                        "UTF-8")
                    comment_item = {'cid': cid, 'author_avatar': author_avatar, 'author': author, 'comment': comment}
                    item['comments'].append(comment_item)
                except IndexError:
                    self.logger.warning('Unable to extract comment from: %s' % (node.extract()))
        if page <= response.meta['total']:
            url = 'http://www.mafengwo.cn/together/ajax.php?act=moreComment&page=%d&tid=%d' % (page, item['tid'])
            yield scrapy.Request(url, meta={'item': item, 'page': page, 'total': response.meta['total']},
                                 callback=self.parse_comments)
        else:
            yield item
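
Both Mafengwo spiders read their configuration from the Scrapy project settings: mafengwo-jieban uses MAFENGWO_JIEBAN_PAGES to decide how many list pages to request and, if MAFENGWO_SESSION_ID is set, sends it as the PHPSESSID cookie. A minimal sketch of how these settings might be supplied (the concrete values are placeholders):

# settings.py (sketch; placeholder values)
MAFENGWO_JIEBAN_PAGES = 20                    # number of list pages to fetch
MAFENGWO_SESSION_ID = 'your-phpsessid-here'   # sent as the PHPSESSID cookie

# The same values can also be passed on the command line, e.g.:
#   scrapy crawl mafengwo-jieban -s MAFENGWO_JIEBAN_PAGES=20 -s MAFENGWO_SESSION_ID=...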

pintour

# coding=utf-8
import json
from urlparse import urljoin
import re
import logging

import scrapy
from scrapy.http import Request
from scrapy.selector import Selector

from andaman.utils.html import html2text, parse_time
from andaman.items.jieban import JiebanItem


class PintourSpider(scrapy.Spider):
    name = 'pintour'
    allowed_domains = ['pintour.com']

    def start_requests(self):
        # Note: this reuses the MAFENGWO_JIEBAN_PAGES setting to decide how many list pages to fetch
        total_page = self.crawler.settings.getint('MAFENGWO_JIEBAN_PAGES', 10)
        for i in range(1, total_page):
            url = 'http://www.pintour.com/list/0-0-0-0-2-1-s-0_%d' % i
            yield scrapy.Request(url)

    def parse(self, response):
        metalist = Selector(text=response.body).xpath('//ul[@class="mateList"]/li/div/h3/a/@href').extract()
        for href in metalist:
            tid = int(href[1:])
            url = 'http://www.pintour.com/%d' % tid
            yield Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        item = JiebanItem()
        item['source'] = 'pintour'
        item['tid'] = int(response.url.split('/')[3])
        item['title'] = response.xpath('//title/text()').extract()[0]
        data = response.xpath('//div[@class="colBox clearfix"]')[0]
        item['author'] = data.xpath('//div[@class="colBoxL clearfix"]/dl/dt/a/text()').extract()[0]
        item['author_avatar'] = data.xpath('//div[@class="colBoxL clearfix"]/a/img/@src').extract()[0]
        item['type'] = data.xpath('//div[@class="colBoxR"]/div//a/span/text()').extract()
        time = data.xpath('.//div[@class="timePlace clearfix"]/p/text()').extract()[0]
        item['start_time'] = time
        item['departure'] = data.xpath('.//div[@class="timePlace clearfix"]/p[@class="plrCon"]/a/text()').extract()[0]
        item['destination'] = data.xpath('.//div[@class="timePlace clearfix"]/p[@class="plrCon"]/a/text()').extract()
        # The first link is the departure place; the remaining links are the destinations
        del item['destination'][0]
        item['description'] = ' '.join(
            filter(lambda v: v, [tmp.strip() for tmp in data.xpath('//div[@class="colBoxB"]//text()').extract()]))
        item['comments'] = []
        # Note: posts whose page has no "N条回应" (reply count) marker never reach parse_comments and are not yielded
        if re.search(r'\d+条回应', response.body):
            reply_num = int(re.search(r'\d+条回应', response.body).group(0)[:-9])
            total = reply_num / 20 + 1
            url = 'http://www.pintour.com/%d_1' % item['tid']
            yield Request(url, meta={'item': item, 'page': 1, 'total': total, 'tid': item['tid']},
                          callback=self.parse_comments)

    def parse_comments(self, response):
        item = response.meta['item']
        page = response.meta['page'] + 1
        for node in response.xpath('//ul[@class="reply"]/li'):
            author = node.xpath('.//div/input/@value').extract()[0]
            author_avatar = node.xpath('.//a/img/@src').extract()[0]
            comment = node.xpath('.//div/input/@value').extract()[2]
            cid = int(node.xpath('.//div/@class').extract()[0].encode('UTF-8')[10:])
            comment_item = {'cid': cid, 'author_avatar': author_avatar, 'author': author, 'comment': comment}
            item['comments'].append(comment_item)
        if page <= response.meta['total']:
            url = 'http://www.pintour.com/%d_%d' % (item['tid'], page)
            yield Request(url, meta={'item': item, 'page': page, 'total': response.meta['total']},
                          callback=self.parse_comments)
        else:
            yield item

ctrip

# coding=utf-8
import json
from urlparse import urljoin
import re
import logging

import scrapy
from scrapy.http import Request
from scrapy.http import FormRequest
from scrapy.selector import Selector

from andaman.utils.html import html2text, parse_time
from andaman.items.jieban import JiebanItem


class CtripSpider(scrapy.Spider):
    name = 'ctrip'

    def start_requests(self):
        start_urls = [
            'http://vacations.ctrip.com/tours',
            'http://vacations.ctrip.com/tours/inter'
        ]
        for url in start_urls:
            yield Request(url)

    def parse(self, response):
        # Crawl the city list
        for city in response.xpath('//div[@class="sel_list"]/dl/dd/a/@href').extract():
            num = int(re.search(r'\d+', str(city)).group(0))
            url = 'http://you.ctrip.com/DangdiSite/events/%d.html' % num
            yield Request(url, callback=self.parse_city)

    def parse_city(self, response):
        # Crawl the list of articles on each city page
        for href in response.xpath('//ul[@class="cf"]/li/a/@href').extract():
            url = urljoin(response.url, href)
            yield Request(url, callback=self.parse_article)

    def parse_article(self, response):
        item = JiebanItem()
        item['title'] = response.xpath('//title/text()').extract()[0]
        item['tid'] = int(response.url.split('/')[5].split('.')[0])
        if response.xpath('//div[@class="gsn-inputbox"]/input[@id="receiver_id"]/../input[@type="text"]/@value').extract():
            item['author'] = response.xpath(
                '//div[@class="gsn-inputbox"]/input[@id="receiver_id"]/../input[@type="text"]/@value').extract()[0]
        else:
            item['author'] = ''
        eventsummaryinfoview = response.xpath('//div[@id="eventsummaryinfoview"]')
        if eventsummaryinfoview.xpath('./p/span[@class="littlepadding"]/text()').extract():
            item['start_time'] = eventsummaryinfoview.xpath('./p/span[@class="littlepadding"]/text()').extract()[0]
        else:
            item['start_time'] = ''
        if eventsummaryinfoview.xpath('//p[@class="events_time"]/text()').extract():
            item['days'] = eventsummaryinfoview.xpath('//p[@class="events_time"]/text()').extract()[2]
        else:
            item['days'] = ''
        if eventsummaryinfoview.xpath('//p[@class="events_place"]/text()').extract():
            item['departure'] = eventsummaryinfoview.xpath('//p[@class="events_place"]/text()').extract()[1]
        else:
            item['departure'] = ''
        if eventsummaryinfoview.xpath('//p[@class="events_place"]/text()').extract():
            item['destination'] = eventsummaryinfoview.xpath('//p[@class="events_place"]/text()').extract()[2]
        else:
            item['destination'] = ''
        if eventsummaryinfoview.xpath('//p[@class="events_tag"]/a/span/text()').extract():
            item['type'] = eventsummaryinfoview.xpath('//p[@class="events_tag"]/a/span/text()').extract()[0]
        else:
            item['type'] = ''
        if response.xpath('//div[@class="events_infotext"]/p/text()').extract():
            item['description'] = ' '.join(filter(lambda v: v, [
                tmp.strip() for tmp in response.xpath('//div[@class="events_infotext"]/p/text()').extract()]))
        else:
            item['description'] = ''
        item['comments'] = []
        frmdata = {"page": "1", "eventId": str(item['tid'])}
        url = 'http://you.ctrip.com/CommunitySite/Activity/EventDetail/EventReplyListOrCommentList'
        yield FormRequest(url, formdata=frmdata, method='POST',
                          meta={'item': item, 'page': 0}, callback=self.parse_comments)

    def parse_comments(self, response):
        # Parsing of the reply/comment list is not implemented yet; just log the raw response body
        logging.info(response.body)

items

# coding=utf-8
import scrapy


class JiebanItem(scrapy.Item):
    # Data source
    source = scrapy.Field()
    # Title
    title = scrapy.Field()
    # Departure time
    start_time = scrapy.Field()
    # Number of days
    days = scrapy.Field()
    # Destination(s)
    destination = scrapy.Field()
    # Departure place
    departure = scrapy.Field()
    # Group size
    people = scrapy.Field()
    # Post description
    description = scrapy.Field()
    # Author avatar URL
    author_avatar = scrapy.Field()
    # Comments
    comments = scrapy.Field()
    # Post id
    tid = scrapy.Field()
    # Travel style
    type = scrapy.Field()
    # Post author
    author = scrapy.Field()
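
Note that the QA spider above fills a QAItem (imported from andaman.items.qa), which is not reproduced in this post. A minimal sketch of what that item class would need to declare, inferred purely from the fields the spider assigns (the file_urls field suggests the avatars are downloaded by Scrapy's FilesPipeline):

# coding=utf-8
# Sketch of andaman/items/qa.py, inferred from MafengwoQaSpider; not the original source
import scrapy


class QAItem(scrapy.Item):
    source = scrapy.Field()            # data source, e.g. 'mafengwo'
    type = scrapy.Field()              # 'question' or 'answer'
    qid = scrapy.Field()               # question id
    aid = scrapy.Field()               # answer id (answers only)
    title = scrapy.Field()             # question title (questions only)
    author_nickname = scrapy.Field()
    author_id = scrapy.Field()
    author_avatar = scrapy.Field()
    file_urls = scrapy.Field()         # avatar URLs, typically consumed by FilesPipeline
    timestamp = scrapy.Field()
    topic = scrapy.Field()
    contents = scrapy.Field()
    tags = scrapy.Field()
    view_cnt = scrapy.Field()
    vote_cnt = scrapy.Field()
    accepted = scrapy.Field()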


pipelines

# coding=utf-8
from datetime import datetime
from mongoengine import Document, EmbeddedDocument, EmbeddedDocumentField, StringField, IntField, ListField, connect
import logging

__author__ = 'golmic'


class Comments(EmbeddedDocument):
    # Comment body
    comment = StringField()
    # Comment author
    author = StringField()
    # Author avatar URL
    author_avatar = StringField()
    # Comment id
    cid = IntField()


class JiebanDocument(Document):
    # Data source
    source = StringField()
    # Post title
    title = StringField()
    # Departure time
    startTime = StringField()
    # Estimated number of days
    days = StringField()
    # Destination(s)
    destination = ListField()
    # Departure place
    departure = StringField()
    # Estimated group size
    groupSize = StringField()
    # Post description
    description = StringField()
    # Author avatar URL
    authorAvatar = StringField()
    # Post id
    tid = IntField()
    # Post comments
    comments = ListField(EmbeddedDocumentField(Comments))
    # Author
    author = StringField()
    # Travel style
    type = StringField()


class JiebanPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('PIPELINE_JIEBAN_ENABLED', False):
            from scrapy.exceptions import NotConfigured
            raise NotConfigured
        return cls(crawler.settings)

    def __init__(self, settings):
        self._conn = {}
        self.init_db(settings)

    @staticmethod
    def init_db(settings):
        mongo_uri = settings.get('ANDAMAN_MONGO_URI')
        if mongo_uri:
            return connect(host=mongo_uri)
        else:
            logging.error('Cannot find setting ANDAMAN_MONGO_URI, MongoDB connection is disabled')

    def process_item(self, item, spider):
        source = item['source']
        title = item['title']
        author = item.get('author', '')
        start_time = item['start_time']
        days = item['days']
        destination = item['destination']
        departure = item['departure']
        people = item['people']
        description = item['description']
        author_avatar = item['author_avatar']
        tid = item['tid']
        comments = item['comments']
        ops = {'set__startTime': start_time,
               'set__source': source,
               'set__author': author,
               'set__title': title,
               'set__days': days,
               'set__destination': destination,
               'set__departure': departure,
               'set__groupSize': people,
               'set__description': description,
               'set__comments': comments,
               'set__authorAvatar': author_avatar}
        # Upsert by post id, so a re-crawled post updates the existing document
        JiebanDocument.objects(tid=tid).update_one(upsert=True, **ops)
        return item
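
JiebanPipeline deactivates itself unless PIPELINE_JIEBAN_ENABLED is true, reads the MongoDB connection string from ANDAMAN_MONGO_URI, and, like any Scrapy pipeline, has to be registered in ITEM_PIPELINES. A sketch of the relevant settings (the module path and the URI are placeholders for whatever the project actually uses):

# settings.py (sketch; module path and URI are placeholders)
ITEM_PIPELINES = {
    'andaman.pipelines.JiebanPipeline': 300,
}
PIPELINE_JIEBAN_ENABLED = True
ANDAMAN_MONGO_URI = 'mongodb://localhost:27017/andaman'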

Proxy middleware and spider code, with support for resuming an interrupted crawl.
settings.py

DOWNLOADER_MIDDLEWARES = {
    'ctrip.middlewares.ProxyMiddleware': 543,
}

middlewares.py

import base64

from scrapy.utils.project import get_project_settings


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        settings = get_project_settings()
        username = settings.get('PROXY_USERNAME')
        password = settings.get('PROXY_PASSWORD')
        # PROXY_PORT holds the proxy endpoint (host:port) that follows the credentials in the URL
        port = settings.get('PROXY_PORT')
        request.meta["proxy"] = 'http://%s:%s@%s' % (username, password, port)
        proxy_user_pass = bytes(username.encode('UTF-8')) + b':' + bytes(password.encode('UTF-8'))
        encoded_user_pass = base64.b64encode(proxy_user_pass)
        request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass
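
ProxyMiddleware pulls its credentials from three project settings; judging from the format string above, PROXY_PORT is expected to contain the proxy endpoint (host:port), not just a port number. A sketch of the corresponding settings with placeholder values:

# settings.py (sketch; placeholder values)
PROXY_USERNAME = 'user'
PROXY_PASSWORD = 'secret'
PROXY_PORT = 'proxy.example.com:8080'   # interpolated into http://user:secret@proxy.example.com:8080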

spiders/xiecheng.py

# -*- coding: utf-8 -*-
import os
import re

import scrapy
from scrapy.http import Request
from bs4 import BeautifulSoup


class XiechengSpider(scrapy.Spider):
    name = "xiecheng"
    allowed_domains = ["ctrip.com"]
    path = '/Users/lujianqiang/Development/xiecheng/'

    def start_requests(self):
        # Collect destination directory names from the city dump
        dir_set = set()
        for line in open('/Users/lujianqiang/Development/ctrip/cx-city.sql', 'r'):
            if '.html' in line:
                dir_set.add(line.split(',')[3].split('/')[2].split('.')[0])
        for dir_name in dir_set:
            try:
                os.mkdir(self.path + dir_name)
            except FileExistsError:
                pass
            url = 'http://you.ctrip.com/travels/' + dir_name + '.html'
            if not os.path.exists(self.path + dir_name + '/end.txt'):
                if os.path.exists(self.path + dir_name + '/start.txt'):
                    # Resume from the list page recorded during the previous run
                    for start_url in open(self.path + dir_name + '/start.txt', 'r'):
                        if not re.match('http://seccenter.ctrip.com/seccenter/main.aspx', start_url):
                            yield Request(start_url, callback=self.parse_list, meta={'dir_name': dir_name})
                else:
                    yield Request(url, callback=self.parse_list, meta={'dir_name': dir_name})
            else:
                # end.txt marks a destination whose list pages have already been crawled completely
                print('++++++++++++++' + dir_name)

    def parse_list(self, response):
        soup = BeautifulSoup(response.body, 'html.parser')
        if not soup.find_all("a", class_='nextpage disabled'):
            # Not on the last page yet: checkpoint the current list URL so the crawl can resume here
            with open(self.path + response.meta['dir_name'] + '/start.txt', 'w') as f:
                f.write(response.url)
            nextpage = soup.find_all("a", class_='nextpage')
            if nextpage:
                yield Request('http://you.ctrip.com/' + nextpage[0]['href'], callback=self.parse_list,
                              meta={'dir_name': response.meta['dir_name']})
        else:
            # The "next page" link is disabled, so this destination is finished; mark it with end.txt
            print('==============' + response.meta['dir_name'])
            with open(self.path + response.meta['dir_name'] + '/end.txt', 'wb') as f:
                f.write(bytes('end', encoding="utf8"))
        items = soup.find_all("a", class_='journal-item cf')
        for item in items:
            post = {}
            post['numview'] = item.ul.find_all('i', class_='numview')[0].get_text()
            post['want'] = item.ul.find_all('i', class_='want')[0].get_text()
            post['numreply'] = item.ul.find_all('i', class_='numreply')[0].get_text()
            filename = item['href'].split('/')[3]
            if not os.path.exists(self.path + response.meta['dir_name'] + '/' + filename):
                yield Request('http://you.ctrip.com/' + item['href'], callback=self.parse_article,
                              meta={'dir_name': response.meta['dir_name'], 'post': post})
            else:
                print(filename + ' exists')

    def parse_article(self, response):
        # e.g. http://you.ctrip.com/travels/hangzhou14/2869877.html
        filename = self.path + response.meta['dir_name'] + '/' + response.url.split('/')[6]
        numview = response.meta['post']['numview']
        want = response.meta['post']['want']
        numreply = response.meta['post']['numreply']
        string = '<numview>{}</numview><want>{}</want><numreply>{}</numreply><url>{}</url>'.format(
            numview, want, numreply, response.url)
        with open(filename, 'wb') as f:
            f.write(bytes(string, encoding="utf8") + response.body)
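
The xiecheng spider implements resumption by hand: it writes a start.txt checkpoint with the last list URL and an end.txt marker once a destination is finished. For comparison, Scrapy also ships a built-in pause/resume mechanism: setting JOBDIR persists the scheduler queue and the duplicates filter between runs. A minimal sketch:

# settings.py (sketch) -- Scrapy's built-in pause/resume support
JOBDIR = 'crawls/xiecheng-1'   # state directory; reuse it to continue an interrupted crawl
# equivalent one-off form: scrapy crawl xiecheng -s JOBDIR=crawls/xiecheng-1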

parse_html

# -*- coding: utf-8 -*-
# Parse the article pages saved by the xiecheng spider and insert the results into MongoDB
import os
import re

from bs4 import BeautifulSoup
import pymongo

localdb = pymongo.MongoClient('mongodb://188.166.210.151', 27017)['ctrip']

article_list = os.listdir()
for article_name in article_list:
    if ".html" not in article_name:
        continue
    file = open(article_name, "r")
    html = file.read()
    if not html:
        continue
    soup = BeautifulSoup(html, 'html.parser')
    post = {}
    print(article_name)
    post['title'] = soup.h2.get_text().strip()
    if re.findall('发表于 (\d\d\d\d-\d\d-\d\d)', html):
        post['date'] = re.findall('发表于 (\d\d\d\d-\d\d-\d\d)', html)[0]
    else:
        post['date'] = ''
    post['author_name'] = soup.find_all('a', id='authorDisplayName')[0].get_text().strip()
    post['author_url'] = 'http://you.ctrip.com' + soup.find_all('a', id='authorDisplayName')[0]['href']
    days = soup.find_all('i', class_='days')
    if days:
        post['days'] = days[0].parent.get_text().split(':')[1].strip()
    times = soup.find_all('i', class_='times')
    if times:
        post['times'] = times[0].parent.get_text().split(':')[1].strip()
    costs = soup.find_all('i', class_='costs')
    if costs:
        post['costs'] = costs[0].parent.get_text().split(':')[1].strip()
    whos = soup.find_all('i', class_='whos')
    if whos:
        post['whos'] = whos[0].parent.get_text().split(':')[1].strip()
    gs_a_pois = soup.find_all('a', class_='gs_a_poi')
    gs_a_poi_set = set()
    for gs_a_poi in gs_a_pois:
        gs_a_poi_set.add(gs_a_poi.get_text().strip('\n'))
    print(gs_a_poi_set)
    post['gs_a_poi'] = list(gs_a_poi_set)
    # The <url>, <numview>, <want> and <numreply> tags were prepended to the file by the spider
    post['url'] = soup.url.get_text()
    post['numview'] = soup.numview.get_text()
    post['want'] = soup.want.get_text()
    post['numreply'] = soup.numreply.get_text()
    print(post)
    content = str(soup.find_all('div', class_='ctd_content')[0])
    post['content'] = content
    localdb.xiecheng.insert_one(post)