
Python crawler: scraping rental listings for every city on 58.com, explained in detail


The full code is as follows:

            
from fake_useragent import UserAgent
from lxml import etree
import requests, os
import time, re, datetime
import base64, json, pymysql
from fontTools.ttLib import TTFont

# A random desktop User-Agent, picked once per crawler session
ua = UserAgent()


class CustomException(Exception):
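  '''Exception carrying a status code and a message for parse failures.'''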

  def __init__(self, status, msg):
    self.status = status
    self.msg = msg


class City_58:
  '''
  Crawler class for 58.com; only these two entry points are planned for now.
  Rental listings url: https://cd.58.com/chuzu/     (cd is the abbreviation for Chengdu)
  Second-hand housing url: https://cd.58.com/ershoufang/
  '''

  # Decode table for 58.com's font obfuscation: the embedded font names its
  # glyphs glyph00001..glyph00010, and those glyphs render as the digits 0-9
  font_dict = {
    "glyph00001": "0",
    "glyph00002": "1",
    "glyph00003": "2",
    "glyph00004": "3",
    "glyph00005": "4",
    "glyph00006": "5",
    "glyph00007": "6",
    "glyph00008": "7",
    "glyph00009": "8",
    "glyph00010": "9",
  }
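  # MySQL connection, created lazily on the first insert (see __save_to_mysql)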
  conn = None

  def __init__(self):
    self.session = requests.Session()
    self.session.headers = {
      "user-agent": ua.random
    }
    self.__init_all_city()

  def __init_all_city(self):
    '''Build the mapping from every city name to its URL abbreviation.'''
    api = "https://www.58.com/changecity.html"
    headers = self.session.headers.copy()
    response = self.session.get(api, headers=headers)
    html = response.text
    # Capture non-greedily up to the following "var" statement, mirroring the
    # independentCityList pattern below (the original "cityList = (.*?)"
    # pattern captured an empty string and broke json.loads)
    res = re.findall("cityList = (.*?)var", html, re.S)[0]
    res = re.sub(r"\s", "", res).rstrip(";")
    dic = json.loads(res)
    for k, v in dic.items():
      for k1, v1 in v.items():
        dic[k][k1] = v1.split("|")[0]
    city_dict = {}

    def traverse_dict(dic: dict):
      for k, v in dic.items():
        if k == "海外" or k == "其他":
          continue
        if isinstance(v, dict):
          # Recurse into province-level dicts; only leaf values are
          # city-name -> abbreviation pairs worth keeping
          traverse_dict(v)
          continue
        city_dict[k] = v

    traverse_dict(dic)

    other_city = re.findall("independentCityList = (.*?)var", html, re.S)[0]
    res = re.sub(r"\s", "", other_city)
    other_city_dic = json.loads(res)

    for k, v in other_city_dic.items():
      other_city_dic[k] = v.split("|")[0]

    city_dict.update(other_city_dic)
    self.all_city_dict = city_dict

  def spider_zufang(self, city: str = "成都", is_get_all: bool = True):
    '''Crawl rental listings, city by city.'''
    assert self.all_city_dict is not None, "Failed to fetch the city list!"
    format_city = self.all_city_dict.pop(city, None)
    assert format_city is not None, "{} is not among the crawlable cities".format(city)
    while True:
      self.city = city
      # self.file = open("./house_info.json", "a", encoding="utf-8")
      start_url = self.__init_zufang(format_city)

      # The idea: open the city's rental page and pull out the category
      # titles (agents, private landlords, and so on), then build the
      # corresponding URLs.
      # start_url has the form https://cd.58.com/chuzu/ and is converted to
      # the per-district form https://cd.58.com/jintang/hezu/.
      # Requesting the converted address gives the category links (agents,
      # private landlords, ...); each link is page 1 of that category.
      # From there we request page after page in a while loop: the page count
      # is not known up front (it could be read from the response body), so
      # the loop simply checks whether a next page exists and breaks when it
      # does not.

      for url_info_list in self.__get_url(start_url):
        # Check the title first: each category (private landlords, brand
        # apartments, ...) may call for a different crawling strategy
        title = url_info_list[1]
        if title in ["個人房源", "安選房源", "經紀人", "熱租房源"] or "出租" in title:
          self.__spiders_v1(url_info_list)
        elif title == "品牌公寓":
          self.__spiders_v2(url_info_list)
        elif title == "房屋求租":
          # Seeking-rent posts are skipped; there is not much data there
          pass
        else:
          # Everything else is out of scope
          continue
      if not is_get_all:
        return
      try:
        city = list(self.all_city_dict.keys()).pop()
        format_city = self.all_city_dict.pop(city)
      except IndexError:
        print('Rental listings for every city crawled, done')
        return

  def spider_ershoufang(self, city: str = "cd"):
    '''Crawler method for second-hand housing (not implemented).'''
    pass

  def __spiders_v1(self, url_info_list):
    '''Handle the private-landlord, verified-listing and similar pages.'''
    url = url_info_list[2]
    page_num = 1
    while True:
      time.sleep(2)
      print("Crawling {}-{} -- page {}".format(url_info_list[0], url_info_list[1], page_num))
      response = self.__get_html_source(url)
      # Pull the listings we want out of the HTML source
      for house_info_list in self.__deal_with_html_source_v1(response):
        self.__save_to_mysql(house_info_list, url_info_list)
      # Is there another page?
      next_page_url = self.__is_exist_next_page(response)
      if not next_page_url:
        print("{}-{} finished crawling".format(url_info_list[0], url_info_list[1]))
        return
      url = next_page_url
      page_num += 1

  def __spiders_v2(self, url_info_list):
    '''Handle the brand-apartment (品牌公寓) pages.'''
    base_url = url_info_list[2]
    format_url = self.__format_url_v2(base_url)
    page_num = 1
    params = None
    while True:
      print("Crawling {} -- page {}...".format(url_info_list[1], page_num))
      time.sleep(2)
      url = format_url.format(page_num)
      response = self.__get_html_source(url, params)
      # Extract the useful data
      for house_info_list in self.__deal_with_html_source_v2(response):
        # self.__save_to_file_v2(house_info_list)
        self.__save_to_mysql(house_info_list)

      # Grab the encryptData token needed for the next page's request
      encryptData = self.__get_html_encryptData(response)

      # Check whether there is a next page, using the "信息不足,為您推薦附近房源"
      # (not enough listings, recommending nearby properties) tip as the
      # end-of-results marker
      if not self.__is_exist_next_page_v2(response):
        print("{} finished crawling".format(url_info_list[1]))
        return
      page_num += 1
      params = {
        "encryptData": encryptData or "",
        "segment": "true"
      }

  def __save_to_file_v2(self, house_info_list):
    '''
    :param house_info_list: list of details describing one listing
    :return:
    '''
    print("Image URL>>:", file=self.file)
    print(json.dumps(house_info_list[0], ensure_ascii=False), file=self.file)
    print("Title>>:", file=self.file)
    print(json.dumps(house_info_list[1], ensure_ascii=False), file=self.file)
    print("Details>>:", file=self.file)
    print(json.dumps(house_info_list[2], ensure_ascii=False), file=self.file)
    print("Address>>:", file=self.file)
    print(json.dumps(house_info_list[3], ensure_ascii=False), file=self.file)
    print("Tags>>:", file=self.file)
    print(json.dumps(house_info_list[4], ensure_ascii=False), file=self.file)
    print("Price>>:", file=self.file)
    print(json.dumps(house_info_list[5], ensure_ascii=False), file=self.file)
    print(file=self.file)

  def __save_to_mysql(self, house_info_list, url_info_list=None):
    '''Save one listing to the database.'''
    if not self.conn:
      self.conn = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                  password="root", db="city_58")
      self.cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)
    if not url_info_list:
      # Brand-apartment rows have no area column
      sql = "insert into zu_house_copy (house_img_url,house_title,house_details,house_address,house_tags,hoouse_price,house_type,city) values (%s,%s,%s,%s,%s,%s,%s,%s)"
      house_info_list.append("品牌公寓")
    else:
      sql = "insert into zu_house_copy (house_img_url,house_title,house_details,house_address,house_tags,hoouse_price,area,house_type,city) values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
      house_info_list.append(url_info_list[0])
      house_info_list.append(url_info_list[1])
    house_info_list.append(self.city)
    row = self.cursor.execute(sql, house_info_list)
    if not row:
      print("Insert failed")
    else:
      self.conn.commit()

  def __deal_with_html_source_v1(self, response):
    html = response.text
    self.__get_font_file(html)
    html = self.__format_html_source(html)
    for house_info_list in self.__parse_html_v1(html):
      yield house_info_list

  def __deal_with_html_source_v2(self, response):
    html = response.text
    # The digits 0-9 in the page source are obfuscated, so first fetch and
    # save the embedded font file
    self.__get_font_file(html)
    # Then translate the obfuscated glyphs into the digits the browser shows
    html = self.__format_html_source(html)
    # Finally extract the data we want from the page
    for house_info_list in self.__parse_html_v2(html):
      yield house_info_list

  def __parse_html_v1(self, html):
    xml = etree.HTML(html)
    li_xpath_list = xml.xpath("//ul[@class='listUl']/li[@logr]")
    for li_xpath in li_xpath_list:
      house_info_list = []
      # Image URL (lazy-loaded images use @lazy_src rather than @src)
      try:
        house_img_url = li_xpath.xpath("div[@class='img_list']/a/img/@lazy_src")[0]
      except IndexError:
        house_img_url = li_xpath.xpath("div[@class='img_list']/a/img/@src")[0]
      house_info_list.append(house_img_url)
      # Title
      house_title = re.sub(r"\s", "", li_xpath.xpath("div[@class='des']/h2/a/text()")[0])
      house_info_list.append(house_title)
      # Details
      house_details = re.sub(r"\s", "", li_xpath.xpath("div[@class='des']/p[@class='room strongbox']/text()")[0].strip())
      house_info_list.append(house_details)
      # Address
      house_address = re.sub(r"\s", "", li_xpath.xpath("div[@class='des']/p[@class='add']")[0].xpath("string(.)"))
      house_info_list.append(house_address)
      # Tags (this layout has none)
      house_tags = "暫無標簽"
      house_info_list.append(house_tags)
      # Price
      hoouse_price = re.sub(r"\s", "", li_xpath.xpath("div[@class='listliright']/div[@class='money']")[0].xpath("string(.)"))
      house_info_list.append(hoouse_price)
      yield house_info_list

  def __parse_html_v2(self, html):
    '''Parse a brand-apartment page and yield its listings.'''
    xml = etree.HTML(html)
    li_xpath_list = xml.xpath("//ul[@class='list']/li")
    for li_xpath in li_xpath_list:
      house_info_list = []
      # Image URL; only the first image is taken for now
      house_img_url = li_xpath.xpath("a/div[@class='img']/img/@lazy_src")[0]
      house_info_list.append(house_img_url)
      # Title
      house_title = li_xpath.xpath("a/div[@class='des strongbox']/h2/text()")[0].strip()
      house_info_list.append(house_title)
      # Details
      house_details = re.sub(r"\s", "", li_xpath.xpath("a/div[@class='des strongbox']/p[@class='room']/text()")[0])
      house_info_list.append(house_details)
      # Address
      house_address = re.sub(r"\s", "", li_xpath.xpath(
        "a/div[@class='des strongbox']/p[@class='dist']")[0].xpath("string(.)")) or "暫無地址"
      house_info_list.append(house_address)
      # Tags
      house_tags = ",".join(li_xpath.xpath("a/div[@class='des strongbox']/p[@class='spec']/span/text()"))
      house_info_list.append(house_tags)
      # Price
      hoouse_price = re.sub(r"\s", "", li_xpath.xpath("a/div[@class='money']/span[@class='strongbox']")[0].xpath(
        "string(.)")) or "暫無價格"
      house_info_list.append(hoouse_price)
      yield house_info_list

  def __get_font_file(self, html):
    '''Extract the embedded font from the page source, save it, and load it
    as a TTFont object.'''
    try:
      b64 = re.findall(r"base64,(.*?)\'", html, re.S)[0]
      res = base64.b64decode(b64)
      with open("./online_font.ttf", "wb") as f:
        f.write(res)
      self.online_font = TTFont("./online_font.ttf")
      self.online_font.saveXML("./online.xml")
    except IndexError:
      return

  def __format_html_source(self, html):
    assert self.online_font, "A font object must exist first"
    assert os.path.exists("./online.xml"), "Fetch the font file first."
    with open("./online.xml", "rb") as f:
      file_data = f.read()
    online_uni_list = self.online_font.getGlyphOrder()[1:]
    file_selector = etree.HTML(file_data)
    for uni2 in online_uni_list:
      # Map each glyph name to its unicode entity, then substitute the
      # real digit into the page source
      code = file_selector.xpath("//cmap//map[@name='{}']/@code".format(uni2))[0]
      dd = "&#x" + code[2:].lower() + ";"
      if dd in html:
        html = html.replace(dd, self.font_dict[uni2])
    return html

  def __format_url_v2(self, url):
    '''
    :param url: https://cd.58.com/pinpaigongyu/?from=58_pc_zf_list_ppgy_tab_ppgy
    :return: https://cd.58.com/pinpaigongyu/pn/{}/?from=58_pc_zf_list_ppgy_tab_ppgy
    '''
    a = url.split("?")
    a[0] = a[0] + "pn/{}"
    format_url = "?".join(a)
    return format_url

  def __is_exist_next_page_v2(self, response):
    # The "tip" div only appears once 58 has run out of matching listings
    xml = self.__response_to_xml(response)
    try:
      _ = xml.xpath("//div[@class='tip']")[0]
      return False
    except IndexError:
      return True

  def __get_html_encryptData(self, response):
    html = response.text
    encryptData = re.findall(r"encryptData\":\"(.*?)\"", html, re.S)[0]
    return encryptData

  def __get_url(self, start_url: str):
    url_set = set()
    for area, v in self.area_dict.items():
      url = self.__conversion_url(start_url, v)
      response = self.__get_html_source(url)
      title_dict = self.__get_title_info(response)
      for title_name, v in title_dict.items():
        # The seeking-rent and brand-apartment URLs repeat across districts,
        # so deduplicate them here
        if v in url_set:
          continue
        else:
          url_set.add(v)
          yield [area, title_name, v]

  def __conversion_url(self, url: str, area: str):
    '''
    :param url: https://cd.58.com/chuzu/
    :param area:
    :return: https://cd.58.com/<district abbreviation>/chuzu/
    '''
    lis = url.split("/")
    lis.insert(3, area)
    return "/".join(lis)

  def __init_zufang(self, format_city):
    '''Fetch the landing page and collect the data the crawl needs.'''
    start_url = "https://{}.58.com/chuzu/".format(format_city)
    headers = self.session.headers.copy()
    response = self.session.get(url=start_url, headers=headers)
    self.__get_area_info(response)
    return start_url

  def __get_html_source(self, url, params=None):
    '''Fetch a page's source with a GET request.'''
    time.sleep(1)
    headers = self.session.headers.copy()
    try:
      if not params:
        params = {}
      response = self.session.get(url=url, headers=headers, params=params)
      return response
    except Exception as e:
      with open("./url_log_error.txt", "a", encoding="utf-8") as f:
        f.write(str(datetime.datetime.now()) + "\n")
        f.write(str(e) + "\n")
        f.write("error_url>>:{}".format(url) + "\n")

  def __response_to_xml(self, response):
    try:
      xml = etree.HTML(response.text)
      return xml
    except AttributeError:
      raise CustomException(10000, "Failed to convert response to xml, offending url>>:{}".format(response))

  def __is_exist_next_page(self, response):
    '''Return the next page's URL if one exists, otherwise False.'''
    xml = self.__response_to_xml(response)
    try:
      next_page_url = xml.xpath("//a[@class='next']/@href")[0]
      return next_page_url
    except IndexError:
      return False

  def __get_area_info(self, response):
    '''Get the districts of the current city.'''
    xml = self.__response_to_xml(response)
    a_xpath_list = xml.xpath("//dl[@class='secitem secitem_fist']//a[not(@class)]")
    area_key_list = []
    area_value_list = []
    for a_xpath in a_xpath_list:
      area_key_list.append(a_xpath.xpath("text()")[0])
      area_value_list.append(re.findall("com/(.*?)/", a_xpath.xpath("@href")[0])[0])
    assert len(area_key_list) == len(area_value_list), "Incomplete data"
    self.area_dict = {k: v for k, v in zip(area_key_list, area_value_list)}

  def __get_title_info(self, response):
    '''Get the listing categories on the page, e.g. 個人房源, 合租房, 經紀人,
    熱租房源... (the links inside the "listTitle" div)'''
    xml = self.__response_to_xml(response)
    a_xpath_list = xml.xpath("//div[@class='listTitle']//a[not(@class)]")
    title_key_list = []
    title_value_list = []
    for a_xpath in a_xpath_list:
      title_key_list.append(a_xpath.xpath("span/text()")[0])
      title_value_list.append(a_xpath.xpath("@href")[0])
    assert len(title_key_list) == len(title_value_list), "Incomplete data"
    return {k: v for k, v in zip(title_key_list, title_value_list)}


if __name__ == '__main__':
  city_58 = City_58()
  city_58.spider_zufang("重慶")
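The most interesting part of the script is the font de-obfuscation (font_dict, __get_font_file, __format_html_source): 58.com ships a custom font whose cmap table maps the HTML entities in the page to glyphs named glyph00001 through glyph00010, so the real digits only appear once that mapping is applied. Below is a minimal standalone sketch of the same idea; it reads the cmap table in memory through fontTools instead of round-tripping through saveXML and an XML file on disk. The function name and the assumption that the font was already extracted (as __get_font_file does) are mine, not the author's:

from fontTools.ttLib import TTFont

def decode_obfuscated_digits(html: str, font_path: str) -> str:
  '''Replace font-obfuscated digit entities in html with real digits.'''
  font = TTFont(font_path)
  # On 58.com the glyphs glyph00001..glyph00010 stand for the digits 0..9
  glyph_to_digit = {"glyph{:05d}".format(i + 1): str(i) for i in range(10)}
  # Each cmap subtable maps unicode codepoints to glyph names
  for table in font["cmap"].tables:
    for codepoint, glyph_name in table.cmap.items():
      if glyph_name in glyph_to_digit:
        entity = "&#x{:x};".format(codepoint)
        html = html.replace(entity, glyph_to_digit[glyph_name])
  return html

Reading the cmap table in memory also avoids rewriting online.xml to disk for every page the crawler visits.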
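__save_to_mysql assumes a city_58 database that already contains the zu_house_copy table, but the article never shows the schema. The sketch below creates a plausible one inferred from the two INSERT statements; it is an assumption rather than the author's original DDL. Note that the column names, including the hoouse_price typo, have to match the SQL in the script exactly:

import pymysql

# Hypothetical DDL inferred from the INSERT statements above; "area" must be
# nullable because brand-apartment rows do not set it
DDL = """
CREATE TABLE IF NOT EXISTS zu_house_copy (
  id INT AUTO_INCREMENT PRIMARY KEY,
  house_img_url VARCHAR(512),
  house_title VARCHAR(255),
  house_details VARCHAR(255),
  house_address VARCHAR(255),
  house_tags VARCHAR(255),
  hoouse_price VARCHAR(64),
  area VARCHAR(64),
  house_type VARCHAR(64),
  city VARCHAR(64)
) CHARACTER SET utf8mb4
"""

conn = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                       password="root", db="city_58")
with conn.cursor() as cursor:
  cursor.execute(DDL)
conn.commit()
conn.close()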

Attached are the results scraped into the database:

[Figure 1: database rows of scraped rental listings]

[Figure 2: database rows of scraped rental listings]

That is all for this article. I hope it helps with your studies, and I hope you will keep supporting 腳本之家.

