這裡主要介紹 bs4 解析方法和 json 方法,以 8684 網站為例,爬取全國公交線路。
import json
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from xpinyin import Pinyin
# Request headers sent with every page fetch; the User-Agent value is a
# desktop Chrome browser string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/76.0.3809.100 Safari/537.36'
    ),
}
# Level-1 page
def first_get(url, fp):
    """Crawl the city index page and descend into every category link.

    Args:
        url: Base URL of the city site; also used as the base for resolving
            the links found on the page.
        fp: Writable text file object, passed through to the lower levels.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    first = requests.get(url=url, headers=headers, timeout=10)
    soup = BeautifulSoup(first.text, 'lxml')

    # Links from the "number" row followed by links from the "pinyin" row.
    number_list = soup.select('.bus_kt_r1 > a')
    char_list = soup.select('.bus_kt_r2 > a')

    for link in number_list + char_list:
        # urljoin gives the same result as the previous string concatenation
        # for root-relative hrefs, and also copes with absolute or relative
        # hrefs.
        two_url = urljoin(url, link['href'])
        # Descend into the level-2 page.
        two_get(two_url, fp, url)
# Level-2 page
def two_get(two_url, fp, url):
    """Crawl one category page and descend into every bus-line link on it.

    Args:
        two_url: URL of the category (level-2) page to fetch.
        fp: Writable text file object, passed through to ``three_get``.
        url: Base URL of the city site, used to resolve the line links.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    second = requests.get(url=two_url, headers=headers, timeout=10)
    soup = BeautifulSoup(second.text, 'lxml')

    # Every anchor directly under the line-list container is one bus line.
    bus_list = soup.select('.stie_list > a')

    for link in bus_list:
        # Resolve against the site root. The old code used url.strip('/'),
        # which strips both ends and only worked because the URL does not
        # start with '/'.
        three_url = urljoin(url, link['href'])
        # Descend into the level-3 (line detail) page.
        three_get(three_url, fp)
def _strip_label(text, label):
    """Return ``text`` without a leading ``label``; unchanged if absent."""
    if text.startswith(label):
        return text[len(label):]
    return text


def _collect_station_names(layers):
    """Return the text of every anchor found in the given layer elements."""
    return [anchor.text for layer in layers for anchor in layer.select('a')]


# Level-3 page
def three_get(three_url, fp):
    """Scrape one bus-line detail page and append it to ``fp`` as a JSON line.

    Args:
        three_url: URL of the line detail page.
        fp: Writable text file object; one JSON object plus a newline is
            written per call.

    Raises:
        IndexError: If the page lacks any of the required elements (title,
            the two info rows, company link, first direction header).
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    three = requests.get(url=three_url, headers=headers, timeout=10)
    soup = BeautifulSoup(three.text, 'lxml')

    # Line name
    way_name = soup.select('.bus_i_t1 > h1')[0].text.strip('?')

    # Operating hours and fare. str.strip(label) treats the label as a SET of
    # characters and would also eat matching characters from the ends of the
    # value, so only the leading label is removed here.
    info_rows = soup.select('.bus_i_content > .bus_i_t4')
    bus_time = _strip_label(info_rows[0].text, '運行時間:')
    bus_price = _strip_label(info_rows[1].text, '票價信息:')

    # Bus company
    bus_company = soup.select('.bus_i_content > .bus_i_t4 > a')[0].text

    # One header per direction; the station layers are split evenly in two.
    direction_headers = soup.select('.bus_line_top > .bus_line_no')
    layers = soup.select('.bus_site_layer')
    half = len(layers) // 2

    # Up direction: station count and station names (first half of layers).
    # NOTE(review): when the line has a single direction, only the first half
    # of the layers is still used here, as in the original code. Confirm
    # against the page layout whether all layers should be taken then.
    up_number = direction_headers[0].text.strip('共站').strip()
    up_number_name_list = _collect_station_names(layers[:half])

    # Down direction exists only when a second header is present. This
    # replaces a broad ``except Exception`` that could hide unrelated errors.
    if len(direction_headers) > 1:
        down_number = direction_headers[1].text.strip('共站').strip()
        down_number_name_list = _collect_station_names(layers[half:])
    else:
        down_number = '無下行線路'
        down_number_name_list = []

    print("正在抓取%s......" % way_name)
    item = {
        '線路名稱': way_name,
        '運行時間': bus_time,
        '票價信息': bus_price,
        '公交公司': bus_company,
        '上行總站數': up_number,
        '獲取上行總站牌': up_number_name_list,
        '下行總站數': down_number,
        '獲取下行總站牌': down_number_name_list,
    }
    # ensure_ascii=False keeps the Chinese text readable in the output file.
    string = json.dumps(item, ensure_ascii=False)
    fp.write(string + '\n')
    print("結束抓取%s......" % way_name)
    # Pause between detail pages to throttle the request rate.
    time.sleep(2)
def main():
    """Prompt for a city name, build its site URL, and crawl all bus lines.

    The city name is converted to pinyin and used as the sub-domain of
    ``8684.cn``. Results are written to ``西安公交.txt`` as one JSON object
    per line.
    """
    pin = Pinyin()
    string = pin.get_pinyin(input("輸入所要查詢的城市:"))
    # Join ALL syllables. The old code concatenated only the first two, which
    # raised IndexError for one-syllable input and truncated longer names
    # (e.g. 'ha-er-bin' became 'haer').
    city_name = ''.join(string.split('-'))
    url = 'https://{}.8684.cn/'.format(city_name)

    # NOTE(review): the output file name is fixed regardless of the city
    # entered; kept as-is so existing consumers of the file are unaffected.
    # The context manager guarantees the file is flushed and closed even if
    # the crawl raises.
    with open('西安公交.txt', 'w', encoding='utf8') as fp:
        # Enter the level-1 page.
        first_get(url, fp)


if __name__ == '__main__':
    main()
更多文章、技術交流、商務合作、聯繫博主
微信掃碼或搜索:z360901061

微信掃一掃加我為好友
QQ號聯繫: 360901061
您的支持是博主寫作最大的動力,如果您喜歡我的文章,感覺我的文章對您有幫助,請用微信掃描下面二維碼支持博主2元、5元、10元、20元等您想捐的金額吧,狠狠點擊下面給點支持吧,站長非常感激您!手機微信長按不能支付解決辦法:請將微信支付二維碼保存到相冊,切換到微信,然後點擊微信右上角掃一掃功能,選擇支付二維碼完成支付。
【本文對您有幫助就好】元
