Anjuke (安居客) crawler: scraping Anjuke web pages
This post builds on the previous article, "Using proxies in Python to crawl Anjuke second-hand house data (Part 1)", and extends what gets crawled. The fields collected are: listing title (待售房屋), rooms (室), living rooms (厅), bathrooms (卫), area (面积), area unit (面积单位), orientation (朝向), floor (楼层), year built (建筑年份), community name (小区名称), district (区), town (镇), road (道路), tags (标签), total price (总价), total price unit (总价单位), average price (均价) and average price unit (均价单位). Multithreading is used to speed up the crawl; a rough sketch of the overall flow follows this paragraph, and the two scripts after it are the URL manager and the crawler itself.
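Before the two scripts, here is a rough, hypothetical sketch of the flow they implement: a pool of page URLs is filled first, then worker threads take URLs from the pool, fetch and parse each page, and append the parsed rows to a CSV file. fetch_page, the example.com URLs and demo.csv are placeholders, not the real Anjuke endpoints or output file:

# Minimal sketch of the overall flow (illustrative only; fetch_page and the URLs are placeholders).
import csv
import threading
from queue import Empty, Queue


def fetch_page(url):
    # Stand-in for "download one listing page and parse it"; returns parsed rows.
    return [[url, '3室', '2厅', '1卫']]


def worker(url_queue, lock, csv_path):
    while True:
        try:
            url = url_queue.get_nowait()
        except Empty:
            return  # no URLs left, this thread exits
        rows = fetch_page(url)
        # Serialize file writes so rows from different threads do not interleave
        with lock, open(csv_path, 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerows(rows)


if __name__ == '__main__':
    url_queue = Queue()
    for page in range(1, 4):
        url_queue.put(f'https://example.com/sale/p{page}/')  # placeholder URL template
    lock = threading.Lock()
    threads = [threading.Thread(target=worker, args=(url_queue, lock, 'demo.csv')) for _ in range(3)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()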
Crawl URL manager (crawlUrlManager.py)

# author: rubyw
# URL manager for the crawler


class CrawlerUrlManager():
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    # Add a single URL to be crawled
    def add_new_url(self, url):
        if url is None or len(url) == 0:
            return
        if url in self.new_urls or url in self.old_urls:
            return
        self.new_urls.add(url)
        return True

    # Add URLs in batch
    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    # Get one URL to crawl
    def get_url(self):
        if self.has_new_url():
            url = self.new_urls.pop()
            self.old_urls.add(url)
            return url
        else:
            return None

    # Get a batch of URLs to crawl
    def get_new_urls(self, num):
        returnUrls = set()
        if num is None or type(num) != int or num <= 0:
            return returnUrls
        else:
            i = 0
            while self.has_new_url() and i < num:
                url = self.new_urls.pop()
                self.old_urls.add(url)
                returnUrls.add(url)
                i = i + 1
        return returnUrls

    # Check whether there are URLs left to crawl
    def has_new_url(self):
        return len(self.new_urls) > 0

    # Number of URLs still to be crawled
    def get_new_url_size(self):
        return len(self.new_urls)

    # Number of URLs already crawled
    def get_old_url_size(self):
        return len(self.old_urls)


if __name__ == '__main__':
    url_manager = CrawlerUrlManager()
    # Add two URLs in batch, with one duplicate on purpose, to check that de-duplication works
    url_manager.add_new_url('url1')
    url_manager.add_new_urls(['url1', 'url2'])
    print(url_manager.new_urls, url_manager.old_urls)
    # Fetch one URL, then print both sets
    print('#' * 30)
    new_url = url_manager.get_url()
    print(url_manager.new_urls, url_manager.old_urls)
    # Fetch another URL, then print both sets
    print('#' * 30)
    new_url = url_manager.get_url()
    print(url_manager.new_urls, url_manager.old_urls)
    # Check whether any URLs remain in the two sets
    print('#' * 30)
    print(url_manager.has_new_url())
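A note on thread safety: in the crawler script below, get_url() is only ever called from the main thread, so the plain set-based manager above is sufficient. If worker threads were to pull URLs from the manager concurrently, the pop()-then-add() sequence in get_url() would need to be atomic. A minimal sketch, assuming the class above is saved as crawlUrlManager.py (which is what the crawler's import suggests); the ThreadSafeUrlManager subclass is hypothetical and not part of the original code:

import threading

from crawlUrlManager import CrawlerUrlManager


class ThreadSafeUrlManager(CrawlerUrlManager):
    # Hypothetical wrapper: serializes access to new_urls/old_urls with a lock.
    def __init__(self):
        super().__init__()
        self._lock = threading.Lock()

    def add_new_url(self, url):
        with self._lock:
            return super().add_new_url(url)

    def get_url(self):
        # pop() from new_urls and add() to old_urls happen atomically here
        with self._lock:
            return super().get_url()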
Crawling the second-hand house data (secondhand_house_crawler.py)
This script crawls second-hand house listings from Anjuke (the code below targets Wuhan, i.e. wuhan.anjuke.com). To start it, change into the /anjuke directory and run python secondhand_house_crawler.py.

import csv
import threading
import time

import requests
from bs4 import BeautifulSoup

from crawlUrlManager import CrawlerUrlManager


def get_proxies():
    proxy_list = []
    proxy_url = ''  # replace with your own proxy API URL
    try:
        datas = requests.get(proxy_url).json()
        print(datas['code'])
        # Proxy IPs were fetched successfully
        if datas['code'] == 0:
            proxy_list = datas['data']['proxy_list']
            # data_array = datas['data']
            # for i in range(len(data_array)):
            #     proxy_ip = data_array[i]['ip']
            #     proxy_port = str(data_array[i]['port'])
            #     proxy = proxy_ip + ':' + proxy_port
            #     proxy_list.append({'http': 'http://' + proxy, 'https': 'http://' + proxy})
        else:
            code = datas['code']
            print(f'Failed to fetch proxies, status code: {code}')
        return proxy_list
    except Exception as e:
        # print('Exception while calling the Tianqi (天启) API for proxy IPs:', e)
        print('Exception while calling the Kuaidaili (快代理) API for proxy IPs:', e)
        return proxy_list


def craw_anjuke_wuhan(craw_url, proxy):
    if craw_url is None:
        print(threading.current_thread().getName() + ' craw_url is None')
        return
    # Username/password authentication (private / dedicated proxy)
    username = ''  # replace with your own
    password = ''  # replace with your own
    proxies = {
        'http': 'http://%(user)s:%(pwd)s@%(proxy)s/' % {'user': username, 'pwd': password, 'proxy': proxy},
        'https': 'http://%(user)s:%(pwd)s@%(proxy)s/' % {'user': username, 'pwd': password, 'proxy': proxy}
    }
    print(proxies)
    print(threading.current_thread().getName() + f' is crawling {craw_url}... using proxy {proxy}')
    # Build request headers so the request looks like a normal browser user
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'no-cache',
        'cookie': 'aQQ_ajkguid=70C0288A-42CB-4C56-B8EF-8E90F8077A8C; sessid=13C76F04-9178-4EE8-B8B0-F00FE21F4F50; ajk-appVersion=; ctid=22; fzq_h=d23302afd92c82b304657a734e3950aa_1697613588983_b645e9292cff4c148c0e3fb2ff31662e_3746354997; id58=CrIej2Uvhxc/D8k8IRI2Ag; twe=2; fzq_js_anjuke_ershoufang_pc=8e86fa86290dbac07d5de51dd3b9db13_1697615100824_23; obtain_by=1; xxzl_cid=817f908b661647889fa49debaab80d9c; xxzl_deviceid=lrdQ4FRXrfXyN2Qj/gRhBw2SQpTZ81igKeOBCkzlfzjPwEG8whpE1uKNvVqIOvXQ',
        'host': 'wuhan.anjuke.com',
        'pragma': 'no-cache',
        'referer': '',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    with open('data/wuhanSecondHouse.csv', 'a', newline='', encoding='gbk') as f:
        # Use the proxy if one was supplied, otherwise request directly
        if proxy is None:
            r = requests.get(craw_url, headers=headers, timeout=3)
        else:
            r = requests.get(craw_url, headers=headers, proxies=proxies, timeout=3)
        # Parse the page if the request came back normally
        if r.status_code == 200:
            content = r.text
            # print(content)
            soup = BeautifulSoup(content, 'html.parser')
            content_div_nodes = soup.find_all('div', class_='property-content')
            for content_div_node in content_div_nodes:
                # Listing title
                content_title_name = content_div_node.find('h3', class_='property-content-title-name')
                title_name = content_title_name.get_text()
                # Layout: rooms (室), living rooms (厅), bathrooms (卫)
                content_layout = content_div_node.find('p', class_='property-content-info-text property-content-info-attribute')
                layout_datas = content_layout.find_all('span')
                datas_shi = layout_datas[0].get_text() + layout_datas[1].get_text()
                datas_ting = layout_datas[2].get_text() + layout_datas[3].get_text()
                datas_wei = layout_datas[4].get_text() + layout_datas[5].get_text()
                # Area, orientation, floor and year built
                square_num = square_unit = orientations = floor_level = build_year = ''
                content_extra_info_datas = content_div_node.find_all(
                    lambda content_div_node: content_div_node.name == 'p' and content_div_node.get('class') == ['property-content-info-text'])
                for i in range(len(content_extra_info_datas)):
                    if i == 0:
                        square = content_extra_info_datas[0].get_text().strip()
                        square_num = square[0:len(square) - 1]
                        square_unit = square[len(square) - 1:]
                    if i == 1:
                        orientations = content_extra_info_datas[1].get_text().strip()
                    if i == 2:
                        floor_level = content_extra_info_datas[2].get_text().strip()
                    if i == 3:
                        build_year = content_extra_info_datas[3].get_text().strip()
                # Community name and location information (district - town - road)
                content_info_comm = content_div_node.find('div', class_='property-content-info property-content-info-comm')
                # Community name
                housing_estate = content_info_comm.find('p', class_='property-content-info-comm-name').get_text().strip()
                # Address information of the community
                content_info_address = content_info_comm.find('p', class_='property-content-info-comm-address').find_all('span')
                district = content_info_address[0].get_text().strip()
                town = content_info_address[1].get_text().strip()
                road = content_info_address[2].get_text().strip()
                # Extra tags on the listing, e.g. orientation, "满五唯一", age of the flat, near a subway station
                content_info_tag = content_div_node.find_all('span', class_='property-content-info-tag')
                tagstr = ''
                for i in range(len(content_info_tag)):
                    tagstr = tagstr + content_info_tag[i].get_text().strip() + ','
                # Price information
                price_info_datas = content_div_node.find('div', class_='property-price')
                total_price = price_info_datas.find('span', class_='property-price-total-num').get_text().strip()
                total_price_unit = price_info_datas.find('span', class_='property-price-total-text').get_text().strip()
                avarage_price = price_info_datas.find('p', class_='property-price-average').get_text().strip()
                avarage_price_num = avarage_price[0:len(avarage_price) - 3]
                avarage_price_unit = avarage_price[len(avarage_price) - 3:]
                # Write one row per listing to the CSV file
                writer = csv.writer(f)
                writer.writerow([title_name, datas_shi, datas_ting, datas_wei, square_num, square_unit, orientations,
                                 floor_level, build_year, housing_estate, district, town, road, tagstr, total_price,
                                 total_price_unit, avarage_price_num, avarage_price_unit])
                # f.write('%s;%s;%s;%s;%s;%s;%s;%s;%s;%s;%s;%s;%s;%s;%s;%s;%s;%s\n' % (
                #     title_name, datas_shi, datas_ting, datas_wei, square_num, square_unit, orientations, floor_level,
                #     build_year, housing_estate, district, town, road, tagstr, total_price, total_price_unit,
                #     avarage_price_num, avarage_price_unit))
            print(f'{threading.current_thread().getName()} crawl over! Crawler Url is: {craw_url}')
        else:
            print(f'{threading.current_thread().getName()} crawl fail! status code={r.status_code}; Crawler Url is: {craw_url}')


if __name__ == '__main__':
    # Write the header row to the output file first
    with open('data/wuhanSecondHouse.csv', 'w', newline='', encoding='gbk') as f:
        writer = csv.writer(f)
        writer.writerow(
            ['待售房屋', '室', '厅', '卫', '面积', '面积单位', '朝向', '楼层', '建筑年份', '小区名称', '区', '镇', '道路',
             '标签', '总价', '总价单位', '均价', '均价单位']
        )
    # Generate the URLs for crawler_pages pages and put them into the URL pool to be managed
    crawlerUrlManager = CrawlerUrlManager()
    # Number of pages to crawl; adjust as needed
    crawler_pages = 200
    for i in range(crawler_pages):
        region = 'jianghana'
        url = ''  # replace with your own listing-page URL template containing {region} and {page}
        craw_url = url.format(region=region, page=i)
        crawlerUrlManager.add_new_url(craw_url)

    # Try to fetch proxy IPs, so the same IP does not hit the site so often that the anti-crawling logic bans it
    proxy_list = get_proxies()
    proxy_num = len(proxy_list)
    if proxy_num >= 2:
        # Proxies are available: use them (getting at least 5 proxy IPs is recommended); each thread crawls through its own proxy IP
        print(f'Got {proxy_num} proxy IPs, crawling pages through the proxies...')
        while crawlerUrlManager.has_new_url():
            crawler_threads = []
            for i in range(len(proxy_list)):
                proxy = proxy_list[i]
                craw_url = crawlerUrlManager.get_url()
                print(craw_url)
                crawler_thread = threading.Thread(target=craw_anjuke_wuhan, args=(craw_url, proxy))
                crawler_threads.append(crawler_thread)
            # Start this batch of threads
            for crawler_thread in crawler_threads:
                crawler_thread.start()
            for crawler_thread in crawler_threads:
                crawler_thread.join()
            # To be safe, wait a while after one batch finishes before starting the next (3 seconds by default, adjustable)
            time.sleep(3)
    else:
        # No proxies available: crawl with our own IP and keep the interval between requests long enough
        try:
            print('No proxy IPs available, crawling pages with our own IP...')
            while crawlerUrlManager.has_new_url():
                crawler_thread = threading.Thread(target=craw_anjuke_wuhan, args=(crawlerUrlManager.get_url(), None))
                crawler_thread.start()
                crawler_thread.join()
                time.sleep(10)  # wait 10 seconds before the next page so the same IP is not banned for crawling too often
        except Exception as e:
            print('Crawler Exception:', e)
        finally:
            print(f'Number of URLs crawled: {crawlerUrlManager.get_old_url_size()}')
            print(f'Number of URLs not yet crawled: {crawlerUrlManager.get_new_url_size()}')
            if crawlerUrlManager.get_new_url_size() > 0:
                print('URLs not yet crawled:')
                for new_url in crawlerUrlManager.new_urls:
                    print(f'{new_url}')
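A note on the proxy API: get_proxies() assumes the response is JSON shaped like {'code': 0, 'data': {'proxy_list': [...]}}, since that is what datas['code'] and datas['data']['proxy_list'] read. To check the wiring without a real proxy account, the HTTP call can be stubbed; a minimal sketch with unittest.mock, assuming the script above is saved as secondhand_house_crawler.py (per the run command) and using made-up ip:port values:

from unittest import mock

import secondhand_house_crawler as crawler

# Fake response object whose .json() returns the shape get_proxies() expects.
fake_response = mock.Mock()
fake_response.json.return_value = {
    'code': 0,
    'data': {'proxy_list': ['121.10.1.1:8000', '121.10.1.2:8000']},  # placeholder values
}

# Patch requests.get as seen from the crawler module, then call get_proxies().
with mock.patch('secondhand_house_crawler.requests.get', return_value=fake_response):
    print(crawler.get_proxies())  # ['121.10.1.1:8000', '121.10.1.2:8000']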

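After a run, the rows end up in data/wuhanSecondHouse.csv, written with GBK encoding by the open() calls above. A quick sanity check of the output, assuming pandas is installed:

import pandas as pd

# Read the crawler's output; the script writes it with encoding='gbk'.
df = pd.read_csv('data/wuhanSecondHouse.csv', encoding='gbk')
print(df.shape)             # number of listings crawled x 18 columns
print(df.columns.tolist())  # the 18 header fields written in the main block
print(df.head())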