
Crawling Anjuke: scraping Anjuke listing pages


Building on the previous article, "Scraping Anjuke second-hand housing data with Python using proxies (Part 1)", this post extends the set of fields being collected.
The crawler now extracts '待售房屋', '室', '厅', '卫', '面积', '面积单位', '朝向', '楼层', '建筑年份',
'小区名称', '区', '镇', '道路', '标签', '总价', '总价单位', '均价', '均价单位' (listing title; rooms, living rooms and bathrooms; floor area and its unit; orientation; floor level; year built; estate name; district, town and road; tags; total price and its unit; average price per square metre and its unit), and uses multiple threads to speed up the crawl. Several of these columns are number/unit pairs split out of a single text value; a minimal sketch of that split is shown below.
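As a quick illustration of how the number/unit pairs (面积/面积单位, 均价/均价单位) are produced, here is a minimal sketch that splits a combined listing value into its numeric part and its unit. The sample strings are made up for illustration; the full crawler further down does the same job with plain string slicing.

def split_value_and_unit(text):
    # Walk back from the end until a digit or decimal point is found;
    # everything after that position is treated as the unit.
    text = text.strip()
    i = len(text)
    while i > 0 and not (text[i - 1].isdigit() or text[i - 1] == '.'):
        i -= 1
    return text[:i], text[i:]

print(split_value_and_unit('103.4㎡'))     # ('103.4', '㎡')
print(split_value_and_unit('24685元/㎡'))  # ('24685', '元/㎡')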

The crawl URL manager

# crawlUrlManager.py -- URL manager for the crawler
# author: rubyw


class CrawlerUrlManager():

    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    # Add a single URL to be crawled
    def add_new_url(self, url):
        if url is None or len(url) == 0:
            return
        if url in self.new_urls or url in self.old_urls:
            return
        self.new_urls.add(url)
        return True

    # Add URLs in batch
    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    # Get one URL to crawl and move it to the crawled set
    def get_url(self):
        if self.has_new_url():
            url = self.new_urls.pop()
            self.old_urls.add(url)
            return url
        else:
            return None

    # Get a batch of URLs to crawl
    def get_new_urls(self, num):
        returnUrls = set()
        if num is None or type(num) != int or num <= 0:
            return returnUrls
        else:
            i = 0
            while self.has_new_url() and i < num:
                url = self.new_urls.pop()
                self.old_urls.add(url)
                returnUrls.add(url)
                i = i + 1
        return returnUrls

    # Check whether there are URLs left to crawl
    def has_new_url(self):
        return len(self.new_urls) > 0

    # Number of URLs waiting to be crawled
    def get_new_url_size(self):
        return len(self.new_urls)

    # Number of URLs already crawled
    def get_old_url_size(self):
        return len(self.old_urls)


if __name__ == '__main__':
    url_manager = CrawlerUrlManager()

    # Add one URL, then a batch that deliberately repeats it, to check de-duplication
    url_manager.add_new_url('url1')
    url_manager.add_new_urls(['url1', 'url2'])
    print(url_manager.new_urls, url_manager.old_urls)

    # Get one URL, then print both sets
    print('#' * 30)
    new_url = url_manager.get_url()
    print(url_manager.new_urls, url_manager.old_urls)

    # Get another URL, then print both sets
    print('#' * 30)
    new_url = url_manager.get_url()
    print(url_manager.new_urls, url_manager.old_urls)

    # Check whether any URLs are left
    print('#' * 30)
    print(url_manager.has_new_url())
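The built-in __main__ check above only exercises the single-URL calls. Below is a small usage sketch for the batch methods (add_new_urls and get_new_urls), assuming the class is saved as crawlUrlManager.py; the example URLs are placeholders, not real listing pages.

from crawlUrlManager import CrawlerUrlManager

manager = CrawlerUrlManager()
# Queue ten placeholder page URLs
manager.add_new_urls([f'https://example.com/sale/p{i}/' for i in range(1, 11)])

batch = manager.get_new_urls(5)      # pops up to 5 URLs and marks them as crawled
print(len(batch))                    # 5
print(manager.get_new_url_size())    # 5 URLs still waiting
print(manager.get_old_url_size())    # 5 URLs already handed out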

Crawling the second-hand housing data

Crawl the second-hand housing data for Wuhan from Anjuke. To start the crawler, change into the /anjuke directory and run: python secondhand_house_crawler.py

# secondhand_house_crawler.py
import requests
from bs4 import BeautifulSoup
import threading
import time
import csv
from crawlUrlManager import CrawlerUrlManager


def get_proxies():
    proxy_list = []
    proxy_url = ''  # replace with your own proxy API URL
    try:
        datas = requests.get(proxy_url).json()
        print(datas['code'])
        # If the proxy IPs were fetched successfully
        if datas['code'] == 0:
            proxy_list = datas['data']['proxy_list']
            # data_array = datas['data']
            # for i in range(len(data_array)):
            #     proxy_ip = data_array[i]['ip']
            #     proxy_port = str(data_array[i]['port'])
            #     proxy = proxy_ip + ':' + proxy_port
            #     proxy_list.append({'http': 'http://' + proxy, 'https': 'http://' + proxy})
        else:
            code = datas['code']
            print(f'获取代理失败，状态码：{code}')
        return proxy_list
    except Exception as e:
        # print('调用天启API获取代理IP异常:' + str(e))
        print('调用快代理API获取代理IP异常:' + str(e))
        return proxy_list


def craw_anjuke_wuhan(craw_url, proxy):
    if craw_url is None:
        print(threading.current_thread().getName() + ' craw_url is None')
        return

    # Username/password authentication (private or dedicated proxy)
    username = ''  # replace with your own
    password = ''  # replace with your own
    proxies = {
        'http': 'http://%(user)s:%(pwd)s@%(proxy)s/' % {'user': username, 'pwd': password,
                                                        'proxy': proxy},
        'https': 'http://%(user)s:%(pwd)s@%(proxy)s/' % {'user': username, 'pwd': password,
                                                         'proxy': proxy}
    }
    print(proxies)
    print(threading.current_thread().getName() + f' is crawling {craw_url}... 使用代理{proxy}')

    # Build request headers so the crawler looks like a normal browser user
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'no-cache',
        'cookie': 'aQQ_ajkguid=70C0288A-42CB-4C56-B8EF-8E90F8077A8C; sessid=13C76F04-9178-4EE8-B8B0-F00FE21F4F50; ajk-appVersion=; ctid=22; fzq_h=d23302afd92c82b304657a734e3950aa_1697613588983_b645e9292cff4c148c0e3fb2ff31662e_3746354997; id58=CrIej2Uvhxc/D8k8IRI2Ag; twe=2; fzq_js_anjuke_ershoufang_pc=8e86fa86290dbac07d5de51dd3b9db13_1697615100824_23; obtain_by=1; xxzl_cid=817f908b661647889fa49debaab80d9c; xxzl_deviceid=lrdQ4FRXrfXyN2Qj/gRhBw2SQpTZ81igKeOBCkzlfzjPwEG8whpE1uKNvVqIOvXQ',
        'host': 'wuhan.anjuke.com',
        'pragma': 'no-cache',
        'referer': '',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }

    with open('data/wuhanSecondHouse.csv', 'a', newline='', encoding='gbk') as f:
        # Use the proxy if one was provided, otherwise crawl directly
        if proxy is None:
            r = requests.get(craw_url, headers=headers, timeout=3)
        else:
            r = requests.get(craw_url, headers=headers, proxies=proxies, timeout=3)
        # If the request succeeded, start parsing
        if r.status_code == 200:
            content = r.text
            # print(content)
            soup = BeautifulSoup(content, 'html.parser')
            content_div_nodes = soup.find_all('div', class_='property-content')
            writer = csv.writer(f)
            for content_div_node in content_div_nodes:
                # Listing title
                content_title_name = content_div_node.find('h3', class_='property-content-title-name')
                title_name = content_title_name.get_text()
                # Layout (rooms / living rooms / bathrooms)
                content_layout = content_div_node.find(
                    'p', class_='property-content-info-text property-content-info-attribute')
                layout_datas = content_layout.find_all('span')
                datas_shi = layout_datas[0].get_text() + layout_datas[1].get_text()
                datas_ting = layout_datas[2].get_text() + layout_datas[3].get_text()
                datas_wei = layout_datas[4].get_text() + layout_datas[5].get_text()
                # Floor area, orientation, floor level and year built
                square_num = ''
                square_unit = ''
                orientations = ''
                floor_level = ''
                build_year = ''
                content_extra_info_datas = content_div_node.find_all(
                    lambda tag: tag.name == 'p' and tag.get('class') == ['property-content-info-text'])
                for i in range(len(content_extra_info_datas)):
                    if i == 0:
                        square = content_extra_info_datas[0].get_text().strip()
                        square_num = square[0:len(square) - 1]
                        square_unit = square[len(square) - 1:]
                    if i == 1:
                        orientations = content_extra_info_datas[1].get_text().strip()
                    if i == 2:
                        floor_level = content_extra_info_datas[2].get_text().strip()
                    if i == 3:
                        build_year = content_extra_info_datas[3].get_text().strip()
                # Estate name and location (district - town - road)
                content_info_comm = content_div_node.find(
                    'div', class_='property-content-info property-content-info-comm')
                # Estate name
                housing_estate = content_info_comm.find(
                    'p', class_='property-content-info-comm-name').get_text().strip()
                # Address information
                content_info_address = content_info_comm.find(
                    'p', class_='property-content-info-comm-address').find_all('span')
                district = content_info_address[0].get_text().strip()
                town = content_info_address[1].get_text().strip()
                road = content_info_address[2].get_text().strip()
                # Extra tags, e.g. orientation, "满五唯一", age of the house, near a metro station, etc.
                content_info_tag = content_div_node.find_all('span', class_='property-content-info-tag')
                tagstr = ''
                for i in range(len(content_info_tag)):
                    tagstr = tagstr + content_info_tag[i].get_text().strip() + ','
                # Price information
                price_info_datas = content_div_node.find('div', class_='property-price')
                total_price = price_info_datas.find('span', class_='property-price-total-num').get_text().strip()
                total_price_unit = price_info_datas.find('span', class_='property-price-total-text').get_text().strip()
                avarage_price = price_info_datas.find('p', class_='property-price-average').get_text().strip()
                avarage_price_num = avarage_price[0:len(avarage_price) - 3]
                avarage_price_unit = avarage_price[len(avarage_price) - 3:]
                # Write one row per listing to the CSV file
                writer.writerow([title_name, datas_shi, datas_ting, datas_wei, square_num, square_unit,
                                 orientations, floor_level, build_year, housing_estate, district, town, road,
                                 tagstr, total_price, total_price_unit, avarage_price_num, avarage_price_unit])
                # f.write('%s;%s;%s;%s;%s;%s;%s;%s;%s;%s;%s;%s;%s;%s;%s;%s;%s;%s\n' % (
                #     title_name, datas_shi, datas_ting, datas_wei, square_num, square_unit, orientations,
                #     floor_level, build_year, housing_estate, district, town, road, tagstr, total_price,
                #     total_price_unit, avarage_price_num, avarage_price_unit))
            print(f'{threading.current_thread().getName()} crawl over!;Crawler Url is:{craw_url}')
        else:
            print(
                f'{threading.current_thread().getName()} crawl fail! status code={r.status_code};Crawler Url is:{craw_url}')


if __name__ == '__main__':
    # Write the header row to the output file first
    with open('data/wuhanSecondHouse.csv', 'w', newline='', encoding='gbk') as f:
        writer = csv.writer(f)
        writer.writerow(
            ['待售房屋', '室', '厅', '卫', '面积', '面积单位', '朝向', '楼层', '建筑年份', '小区名称', '区', '镇',
             '道路', '标签', '总价', '总价单位', '均价', '均价单位']
        )

    # Generate the URLs for crawler_pages pages and hand them to the URL pool
    crawlerUrlManager = CrawlerUrlManager()
    # Number of pages to crawl; adjust as needed
    crawler_pages = 200
    for i in range(crawler_pages):
        region = 'jianghana'
        # Listing URL template (assumed Anjuke Wuhan /sale/ pattern; verify against the site before running)
        url = 'https://wuhan.anjuke.com/sale/{region}/p{page}/'
        craw_url = url.format(region=region, page=i)
        crawlerUrlManager.add_new_url(craw_url)

    # Try to obtain proxy IPs so that one IP is not hitting the site often enough to be banned by its anti-crawling rules
    proxy_list = get_proxies()
    proxy_num = len(proxy_list)
    if proxy_num >= 2:  # If proxy IPs were obtained, use them -- ideally 5 or more, one per crawler thread
        print(f'获取到{proxy_num}个代理ip，开始使用代理IP爬取页面数据...')
        while crawlerUrlManager.has_new_url():
            crawler_threads = []
            for i in range(len(proxy_list)):
                proxy = proxy_list[i]
                craw_url = crawlerUrlManager.get_url()
                crawler_thread = threading.Thread(target=craw_anjuke_wuhan, args=(craw_url, proxy))
                crawler_threads.append(crawler_thread)
            # Start this batch of threads and wait for them to finish
            for crawler_thread in crawler_threads:
                crawler_thread.start()
            for crawler_thread in crawler_threads:
                crawler_thread.join()
            # To be safe, pause between batches; 3 seconds by default, adjust as needed
            time.sleep(3)
    else:  # No proxy IPs: crawl with our own IP and keep the interval between requests from being too short
        try:
            print('没有获取到代理IP，开始使用自身IP爬取页面数据...')
            while crawlerUrlManager.has_new_url():
                crawler_thread = threading.Thread(target=craw_anjuke_wuhan, args=(crawlerUrlManager.get_url(), None))
                crawler_thread.start()
                crawler_thread.join()
                time.sleep(10)  # Wait 10 seconds between pages to avoid being banned for crawling too often from one IP
        except Exception as e:
            print('Crawler Exception:' + str(e))
        finally:
            print(f'已爬取的url数量：{crawlerUrlManager.get_old_url_size()}')
            print(f'未爬取的url数量：{crawlerUrlManager.get_new_url_size()}')
            if crawlerUrlManager.get_new_url_size() > 0:
                print('未爬取的url如下：')
                for new_url in crawlerUrlManager.new_urls:
                    print(f'{new_url}')
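After a run finishes, it can be worth sanity-checking the output file. The snippet below is not part of the original script and assumes pandas is installed; it simply reloads the GBK-encoded CSV and prints its shape and a few columns.

import pandas as pd

# Reload the crawler's output (written with encoding='gbk' above) and inspect it.
df = pd.read_csv('data/wuhanSecondHouse.csv', encoding='gbk')
print(df.shape)                          # (number of listings, 18 columns)
print(df[['小区名称', '总价', '均价']].head())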
