《Python3爬蟲、數據清洗和可視化實戰》
零一 韓要賓 黃園園 著
第十章 綜合應用實例
實例:按性價比給用戶推薦旅游產品
第一部分:數據采集
import csv
import json
import random
import time
import urllib.request

import requests
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def globalVals():
    """Start the two Chrome sessions used by the crawler.

    Binds two module-level globals:

    * ``driver``  -- loads the route search-result list page.
    * ``driver_`` -- loads each route's detail page, so the list page
      (and the elements already located on it) stays untouched.
    """
    global driver
    global driver_
    driver = webdriver.Chrome()
    driver_ = webdriver.Chrome()
def init_csv(csv_file="D:/qunar_routes.csv"):
    """Create the output CSV file and write its header row.

    Binds two module-level globals: ``f`` (the open file handle, to be
    closed with ``close_csv()``) and ``writer`` (a ``csv.writer`` on it).

    Args:
        csv_file: Path of the CSV file to create. Defaults to the
            location the script originally hard-coded.
    """
    global f
    global writer
    # If the file shows garbled text when opened afterwards, change the
    # encoding from utf-8 to gb18030.
    f = open(csv_file, "w", newline="", encoding="utf-8")
    writer = csv.writer(f)
    # The analysis step addresses the columns by name (路線信息 / 酒店信息),
    # so the header must spell them exactly that way.
    writer.writerow(["出發地", "目的地", "路線信息", "酒店信息"])
def close_csv():
    """Close the CSV file handle stored in the module-level global ``f``."""
    global f
    f.close()
def dump_routes_csv(dep, arr):
    """Write one CSV row per route currently listed in ``driver``.

    For every route card on the search-result page, the route's detail
    page is opened in the second browser (``driver_``), the hotel rating
    and hotel type are read from it, and a row
    ``[dep, arr, <route text>, <hotel info>]`` is written through the
    module-level ``writer``. A route whose detail page cannot be loaded
    or parsed is reported on stdout and skipped.

    Args:
        dep: Departure city, written to the first column.
        arr: Destination, written to the second column.
    """
    global driver
    global driver_
    global writer
    # Locate every route card on the search-result page.
    routes = driver.find_elements(By.CSS_SELECTOR, ".item.g-flexbox.list-item")
    for route in routes:
        try:
            print("\nroute info:%s" % route.text)
            # URL of the route's detail page.
            url = route.get_attribute("data-url")
            print("url:%s" % url)
            # Open the detail page in the second browser so the list page
            # in `driver` is not navigated away from.
            driver_.get(url)
            time.sleep(random.uniform(2, 3))

            # The two page layouts use different elements for the
            # drag-and-drop below.
            if "fhtouch" in url:  # flight + hotel package
                source_css, target_css = "#main-page", "#allHotels"
            else:  # plain self-guided tour
                source_css, target_css = ".flex.scrollable", ".m-ball.m-ball-back"

            # Wait until the page has finished refreshing.
            WebDriverWait(driver_, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, target_css))
            )
            source = driver_.find_element(By.CSS_SELECTOR, source_css)
            target = driver_.find_element(By.CSS_SELECTOR, target_css)

            # The detail page must receive focus through a drag_and_drop
            # action, otherwise the [Page Down] key has no effect.
            ActionChains(driver_).drag_and_drop(source, target).perform()
            # Press [Page Down] three times to scroll the page.
            for _ in range(3):
                ActionChains(driver_).send_keys(Keys.PAGE_DOWN).perform()

            # The hotel elements can only be located after scrolling.
            WebDriverWait(driver_, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".tit .score"))
            )
            # Hotel rating and hotel type, joined into one "hotel info" cell.
            rating = driver_.find_element(By.CSS_SELECTOR, ".tit .score")
            hotel_type = driver_.find_element(
                By.CSS_SELECTOR, ".tit+ .tag-list > .g-tag.solid"
            )
            hotel = "\n".join([rating.text, hotel_type.text])
            print("hotel info:%s" % hotel)

            # Write this route to the CSV file.
            writer.writerow([dep, arr, route.text, hotel])
        except Exception as e:
            # Best effort: report the failure and move on to the next route.
            print(str(e))
            continue
if __name__ == "__main__":
    # Crawl flow: start the browsers and the CSV file, ask the recommend
    # API for destinations of each departure city, open the mobile search
    # result page for each selected destination, scroll it to load more
    # routes, then dump the routes to the CSV file.
    globalVals()
    init_csv()
    try:
        dep_cities = ["杭州"]
        for dep in dep_cities:
            strhtml = requests.get(
                "https://m.dujia.qunar.com/golfz/sight/arriveRecommend?dep="
                + urllib.request.quote(dep)
                + "&exclude=&extensionImg=255,175"
            )
            arrive_dict = json.loads(strhtml.text)
            for arr_item in arrive_dict["data"]:
                # This example only crawls domestic routes; remove the next
                # two lines to crawl international routes as well.
                # NOTE(review): the literals below are Traditional Chinese as
                # found in this copy of the script - confirm they match the
                # text the site actually returns.
                if arr_item["title"] != "國內":
                    continue
                for arr_item_1 in arr_item["subModules"]:
                    for query in arr_item_1["items"]:
                        # This example only crawls Hangzhou -> Lijiang;
                        # remove the next two lines to crawl every
                        # destination.
                        if query["query"] != "麗江":
                            continue
                        # Open the mobile search-result page. The trailing
                        # percent-escapes are the URL-encoded suffix "自由行"
                        # appended to the query term.
                        driver.get(
                            "https://touch.dujia.qunar.com/p/list?cfrom=zyx&dep="
                            + urllib.request.quote(dep)
                            + "&query="
                            + urllib.request.quote(query["query"])
                            + "%e8%87%aa%e7%94%b1%e8%a1%8c&it=n_index_free"
                        )
                        try:
                            # Wait for the page to refresh. A CSS selector is
                            # used because By.CLASS_NAME takes a single class
                            # name, not a space-separated list.
                            WebDriverWait(driver, 10).until(
                                EC.presence_of_element_located(
                                    (By.CSS_SELECTOR, ".item.g-flexbox.list-item")
                                )
                            )
                        except Exception as e:
                            print(str(e))
                            raise
                        print("dep:%s arr:%s" % (dep, query["query"]))
                        # Scroll down 50 times to load more routes.
                        for i in range(50):
                            time.sleep(random.uniform(2, 3))
                            print("page %d" % (i + 1))
                            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
                        # Write the routes for this departure/destination pair.
                        dump_routes_csv(dep, query["query"])
    finally:
        # Release the file and both browser sessions even when the crawl
        # fails; quit() ends the whole WebDriver session, not just the window.
        close_csv()
        driver.quit()
        driver_.quit()
第二部分:數據清洗、建模
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Data cleaning and modelling: read the crawled routes, extract numeric
# features, fit a linear price model, and rank routes by predicted price
# divided by actual price.

# Read the route CSV file written by the crawler.
df = pd.read_csv("D:/qunar_routes.csv")
print(df.head())
df.info()  # info() prints its report itself

# Extract trip length (days) and price from the route text.
# expand=False makes extract() return a Series; astype(float) keeps rows
# whose text does not match the pattern as NaN instead of raising.
df["天數"] = df.路線信息.str.extract(r"(\d+)天\d+晚", expand=False).astype(float)
df["價格"] = df.路線信息.str.extract(r"(\d+)起/人", expand=False).astype(float)
# Extract hotel rating and hotel level from the hotel text.
df["酒店評分"] = df.酒店信息.str.extract(r"(\d\.\d)分", expand=False).astype(float)
df["酒店等級"] = df.酒店信息.str.extract(r"\n(.*)", expand=False)
print(df.head())
df.info()

# Map the hotel level from text to a number; unlisted levels become NaN.
class_map = {"其他": 0, "經濟型": 1, "舒適型": 2, "高檔型": 3, "豪華型": 4}
df["酒店等級"] = df["酒店等級"].map(class_map)

# Histogram of each variable, to check for outliers.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
df["酒店等級"].plot(ax=axes[0], kind="hist", title="酒店等級")
df["酒店評分"].plot(ax=axes[1], kind="hist", title="酒店評分")
df["價格"].plot(ax=axes[2], kind="hist", title="價格")

# Independent variables X and dependent variable y (the price), selected
# by column name so the result does not depend on column order.
feature_cols = ["天數", "酒店評分", "酒店等級"]
# Rows missing any model input cannot be fitted or ranked.
df = df.dropna(subset=feature_cols + ["價格"]).copy()
X, y = df[feature_cols].values, df["價格"].values

# Fit an OLS linear regression model.
ols = sm.OLS(y, X)
result = ols.fit()
# Inspect the fit (the original author's note here reads "R=0.886").
print(result.summary())

# Predict route prices with the fitted model.
y_pred = result.predict(X)
# Value for money is defined as predicted price / actual price.
ratio = y_pred / y
df["性價比"] = ratio
# Sort by value for money, highest first.
print(df.sort_values("性價比", ascending=False))
(
"Don't waste your time looking back, you're not going that way."--《Vikings》
)
更多文章、技術交流、商務合作、聯(lián)系博主
微信掃碼或搜索:z360901061

微信掃一掃加我為好友
QQ號聯(lián)系: 360901061
您的支持是博主寫作最大的動力,如果您喜歡我的文章,感覺我的文章對您有幫助,請用微信掃描下面二維碼支持博主2元、5元、10元、20元等您想捐的金額吧,狠狠點擊下面給點支持吧,站長非常感激您!手機微信長按不能支付解決辦法:請將微信支付二維碼保存到相冊,切換到微信,然后點擊微信右上角掃一掃功能,選擇支付二維碼完成支付。
【本文對您有幫助就好】元
