Crawling Hangzhou's Rental Housing Data, It Turns Out... (Part 2)


First I crawled the data for every city. Although this time we only care about Hangzhou, grabbing all the cities will come in handy later. When I opened the site I looked for a JSON data API, but there isn't one, so I had to fall back on ordinary HTML page extraction to get the data. The code is as follows:
"""增量爬取房天下-所有城市的主页该爬虫,一般情况只需要爬取一次就够了:因为中国的城市变化,个人觉得是不频繁的页面:http://www.fang.com/SoufunFamily.htm"""from scrapy import Selectorfrom scrapy.spiders import Spiderfrom thor_crawl.spiders.spider_setting import DEFAULT_DB_ENVfrom thor_crawl.utils.commonUtil import CommonUtilfrom thor_crawl.utils.db.daoUtil import DaoUtilsclass CityIndex(Spider):name = 'sou_fang_city_index'handle_httpstatus_list = [204, 206, 404, 500]start_urls = ['http://www.fang.com/SoufunFamily.htm']def __init__(self, *args, **kwargs):super().__init__(*args, **kwargs)# ============ 工具 ============self.dao = DEFAULT_DB_ENVself.common_util = CommonUtil()# ============ 持久化相关变量定义 ============self.save_threshold = 20# 一次性插入数据库阈值self.persistent_data = http://www.kingceram.com/post/list()# 内存暂存处理的数据,批量插入数据库self.main_table = 'sou_fang_city_index'# 数据库存储表def __del__(self):self.save_final()def closed(self, res):self.save_final()def parse(self, response):try:body = response.body.decode('gb18030').encode('utf-8')except UnicodeDecodeError as e:print(e)body = response.bodyhxf = Selector(text=body)trs = hxf.xpath('//div[@id="c02"]/table/tr')# 获取所有的行数据this_province = '未知'for tr in trs[:-1]:province_name = self.common_util.get_extract(tr.xpath('td[2]/strong/text()'))# 获取省份名称文本值this_province = this_province if province_name is None or province_name == '' else province_name# 为空的话取之前的省份名称cities = tr.xpath('td[3]/a')# 获取所有的城市列表for city in cities:city_name = self.common_util.get_extract(city.xpath('text()'))# 获取城市名称文本值city_index_url = self.common_util.get_extract(city.xpath('@href'))# 获取城市首页链接self.persistent_data.append({'province_name': this_province,'city_name': city_name,'city_index_url': city_index_url})self.save()def save(self):if len(self.persistent_data) > self.save_threshold:try:self.dao.customizable_add_ignore_batch(self.main_table, self.persistent_data)except AttributeError as e:self.dao = DaoUtils()self.dao.customizable_add_ignore_batch(self.main_table, self.persistent_data)print('save except:', e)finally:self.persistent_data = http://www.kingceram.com/post/list()def save_final(self):if len(self.persistent_data)> 0:try:self.dao.customizable_add_ignore_batch(self.main_table, self.persistent_data)except AttributeError as e:self.dao = DaoUtils()self.dao.customizable_add_ignore_batch(self.main_table, self.persistent_data)print('save_final except:', e)finally:self.persistent_data = http://www.kingceram.com/post/list()
Next comes crawling all of Hangzhou's rental listings. The idea is to start from the "租房" (rent) menu on the Hangzhou city site's home page, get into the listing pages, and then crawl the data district by district. The code is as follows:
"""搜房网-租房信息"""import reimport scrapyfrom scrapy import Selectorfrom scrapy.spiders import Spiderfrom thor_crawl.spiders.spider_setting import DEFAULT_DB_ENVfrom thor_crawl.utils.commonUtil import CommonUtilfrom thor_crawl.utils.db.daoUtil import DaoUtilsclass Renting(Spider):name = 'sou_fang_renting'handle_httpstatus_list = [302, 204, 206, 404, 500]start_urls = ['http://www.souFang.com/SoufunFamily.htm']def __init__(self, *args, **kwargs):super().__init__(*args, **kwargs)# ============ 工具 ============self.dao = DEFAULT_DB_ENVself.common_util = CommonUtil()# ============ 持久化相关变量定义 ============self.save_threshold = 20# 一次性插入数据库阈值self.persistent_data = http://www.kingceram.com/post/list()# 内存暂存处理的数据,批量插入数据库self.main_table = 'sou_fang_renting'# 数据库存储表# ============ 业务 ============province_name = '浙江'city_name = '杭州'self.target = 'SELECT id, province_name, city_name, city_index_url ' /'FROM sou_fang_city_index ' /'WHERE province_name ="{province_name}" and city_name = "{city_name}"'.format(province_name=province_name, city_name=city_name)self.url_template = 'http://{city_code}.zu.fang.com/'# 租房首页的模板URLdef __del__(self):self.save_final()def start_requests(self):start_requests = list()for row in self.dao.get_all(self.target):if row['city_index_url'] != '':meta = {'city_index_id': row['id'],'province_name': row['province_name'],'city_name': row['city_name']}url = self.url_template.format(city_code=re.search(r'http://(.+)\.fang\.com', row['city_index_url']).group(1))start_requests.append(scrapy.FormRequest(url=url, method='GET', meta=meta))return start_requestsdef closed(self, res):self.save_final()# 拿到所有的地区,去掉"不限"def parse(self, response):try:body = response.body.decode('gb18030').encode('utf-8')except UnicodeDecodeError as e:print(e)body = response.bodymeta = response.metaurl = response.urlhxf = Selector(text=body)a_tag_list = hxf.xpath('//dl[@id="rentid_D04_01"]/dd/a')print('a_tag_list len: ', len(a_tag_list))if a_tag_list is None or len(a_tag_list)