A 91dict (人人詞典) data scraper built on Python requests, targeting http://www.91dict.com
Scraped content includes: the word itself, its part of speech and translations, word pronunciation, example-sentence stills, example sentences with translations, and example-sentence audio
Data volume: 53,189 words in total; the example-sentence audio and image files add up to roughly 10 GB. On a 20 Mbps connection the full crawl finished in under an hour in my testing.
Word pronunciation audio is not downloaded by the current script; you can extend the scraper to fetch it yourself.
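A minimal sketch of what that extension might look like, reusing the same requests + lxml approach as the scraper below. The XPath targeting the word-level audio element is a hypothetical placeholder, not verified against 91dict's actual markup; inspect the page before using it.

#!/usr/bin/env python3
# Hypothetical sketch: fetch a word page and save its pronunciation audio.
# The XPath below is an assumption; adjust it to the real page structure.
import requests
from lxml import etree

def download_word_pron(word, path):
    res = requests.get('http://www.91dict.com/words?w=' + word, timeout=10)
    res.encoding = 'utf-8'
    data = etree.HTML(res.text)
    # Assumed location of the word-level <audio> element.
    urls = data.xpath("//*[@class='vos']//audio/@src")
    if urls:
        audio = requests.get(urls[0], timeout=10)
        with open(path, 'wb') as f:
            f.write(audio.content)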
[Screenshots: scraped content samples, example-sentence audio, example-sentence stills]
Python version
Python 3+; Python 3.6 recommended
requirements.txt
requests==2.21.0
lxml==4.3.3
How to run
After cloning, run run.bat on Windows or run.sh on Linux.
Directory structure
|---------------------------------------------------------------------------------------------------------------------
|--audio              word audio files; the repo contains only a sample, for reference
|--pic                word image files; the repo contains only a sample, for reference
|--words              the split raw word data, 5,000 words per file; the scraper spawns one process per file
|--result_demo        demo of the resulting word data, not the complete dataset
|--allWords.json      all words, one per line, 53,189 in total; the scraper crawls from this list
|--combain.py         merges the final results, i.e. the JSON files under words/
|--requirements.txt   Python dependencies
|--run.bat            Windows launch script
|--run.sh             Linux launch script
|--scrapy.py          the scraper script
|--split.py           splits the raw word data, i.e. the words in allWords.json
|----------------------------------------------------------------------------------------------------------------------
Core code
split.py (splitting the word list)
#!/usr/bin/env python3
def read_file(file_name):
    """Yield the file line by line so the whole word list never sits in memory."""
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            yield line

def save(file_name, data):
    """Append a line to the target chunk file."""
    with open(file_name, 'a', encoding='utf-8') as f:
        f.write(data)

# Split allWords.json into ./words/1.txt, ./words/2.txt, ... with 5,000 words each.
i, j = 0, 0
for line in read_file('allWords.json'):
    if i % 5000 == 0:
        j += 1
        print('progress: word ' + str(i))
    save('./words/' + str(j) + '.txt', line)
    i += 1
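For reference, the same 5,000-line chunking can also be expressed with itertools.islice. This is just an alternative sketch, assuming allWords.json holds one word per line:

#!/usr/bin/env python3
# Alternative sketch of the same split using itertools.islice.
from itertools import islice

with open('allWords.json', 'r', encoding='utf-8') as src:
    chunk_no = 0
    while True:
        chunk = list(islice(src, 5000))  # read up to 5,000 lines at a time
        if not chunk:
            break
        chunk_no += 1
        with open('./words/%d.txt' % chunk_no, 'w', encoding='utf-8') as out:
            out.writelines(chunk)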
scrapy.py (the scraper)
#!/usr/bin/env python3
import requests
import json
import os
from lxml import etree
from multiprocessing import Process

class ScrapyProcess(Process):
    def __init__(self, file_name):
        super(ScrapyProcess, self).__init__()
        self.file_name = file_name

    def read_file(self):
        # Yield one word per line, stripping the trailing newline.
        with open(self.file_name + '.txt', 'r', encoding='utf-8') as f:
            for line in f:
                yield line[:-1]

    def download_file(self, url, path):
        res = requests.get(url)
        with open(path, 'wb') as f:
            f.write(res.content)

    def connect_file(self, file_name1, file_name2, file_name3):
        # Concatenate two MP3 files into one, then delete the parts.
        with open(file_name1, 'rb') as file1, open(file_name2, 'rb') as file2, \
                open(file_name3, 'wb') as file3:
            file3.write(file1.read())
            file3.write(file2.read())
        os.remove(file_name1)
        os.remove(file_name2)

    def is_in(self, key, dict_list):
        # True if any dict in the list already uses this part-of-speech key.
        for item in dict_list:
            if key in item:
                return True
        return False
    def scrapy(self, word):
        word_info = {}
        url = 'http://www.91dict.com/words?w=' + word
        res = requests.get(url)
        res.encoding = 'utf-8'
        data = etree.HTML(res.text)
        if data.xpath('/html/body/div[2]/section[2]/div/div/div/div[1]/div[1]/p/text()'):
            # The word itself
            word_info['word'] = data.xpath(
                '/html/body/div[2]/section[2]/div/div/div/div[1]/div[1]/p/text()')[0]
            # Phonetics default to '//' when the page provides none
            word_info['am_phonetic'] = '//'
            word_info['en_phonetic'] = '//'
            if list(filter(lambda x: x != '\n', data.xpath("//*[@class='vos']/span[1]/text()"))):
                word_info['en_phonetic'] = list(filter(lambda x: x != '\n', data.xpath(
                    "//*[@class='vos']/span[1]/text()")))[0].replace('\n', '')[1:].replace('[', '/').replace(']', '/')
            if list(filter(lambda x: x != '\n', data.xpath("//*[@class='vos']/span[2]/text()"))):
                word_info['am_phonetic'] = list(filter(lambda x: x != '\n', data.xpath(
                    "//*[@class='vos']/span[2]/text()")))[0].replace('\n', '')[1:].replace('[', '/').replace(']', '/')
            # Parts of speech and translations
            tran = []
            for item in filter(lambda x: x != '',
                               map(lambda x: x.replace('\n', ''),
                                   data.xpath("//*[@class='listBox']/text()"))):
                if len(item.split('. ')) == 1:
                    tran.append({'': item.split('. ')[0]})
                elif len(item.split('. ')) == 2 and not item.startswith('=') and not self.is_in(item.split('. ')[0], tran):
                    tran.append({item.split('. ')[0]: item.split('. ')[1]})
            word_info['tran'] = tran
            # Example sentences
            example = []
            example_len = len(data.xpath(
                "//*[@class='flexslider flexslider_2']/ul/li/div[@class='imgMainbox']"))
            # Sentence bodies
            sens = data.xpath("//*[@class='mBottom']")
            # Sentence translations
            sen_trans = data.xpath("//*[@class='mFoot']/text()")
            # Sentence sources (which film/episode the still comes from)
            origins = list(filter(lambda x: x != '\n', data.xpath(
                "//*[@class='mTop']/text()")))
            # Follow-up sentence and its translation, used when a sentence is cut off
            next_sens = data.xpath(
                "//*[@class='mTextend']/div[2]/div[2]/p[1]/text()")
            next_sen_trans = data.xpath(
                "//*[@class='mTextend']/div[2]/div[2]/p[2]/text()")
            pic_urls = data.xpath(
                "//*[@class='flexslider flexslider_2']/ul/li/div[@class='imgMainbox']/img/@src")
            pron_urls = data.xpath(
                "//*[@class='flexslider flexslider_2']/ul/li/div[@class='imgMainbox']/div/div/audio/@src")
            next_pron_urls = data.xpath("//*[@class='viewdetail']/@href")
            for i in range(example_len):
                # Serialize the sentence markup and slice off the wrapping div tags
                sen = etree.tostring(
                    sens[i], encoding='utf-8')[22:-7].decode('utf-8')
                sen_tran = sen_trans[i][1:]
                # Local paths for the still image and the audio clip
                pic_url = './pic/%s-%d.jpg' % (word_info['word'], i)
                pron_url = './audio/%s-%d.mp3' % (word_info['word'], i)
                self.download_file(pic_urls[i], pic_url)
                # If the sentence is cut off, append the follow-up sentence and
                # stitch the two audio clips into one file
                if not sen.endswith('.') and not sen.endswith(';') and not sen.endswith('?') and not sen.endswith('!'):
                    if sen[-1] != ',':
                        sen += ','
                        sen_tran += ','
                    if i < len(next_sens) and i < len(next_sen_trans):
                        # Follow-up sentence and its translation
                        sen += next_sens[i]
                        sen_tran += next_sen_trans[i]
                        # Audio clip 1 and clip 2, merged into pron_url below
                        pron_url_1 = './audio/%s-%d-1.mp3' % (
                            word_info['word'], i)
                        pron_url_2 = './audio/%s-%d-2.mp3' % (
                            word_info['word'], i)
                        temp = requests.get(
                            'http://www.91dict.com' + next_pron_urls[i]).text
                        temp_data = etree.HTML(temp)
                        self.download_file(pron_urls[i], pron_url_1)
                        # Find the follow-up sentence on the detail page to get its audio
                        for li in temp_data.xpath("//*[@class='item']/li"):
                            if li.xpath("./div[@class='mBottom']/text()")[0].replace('\n', '') == next_sens[i]:
                                self.download_file(
                                    li.xpath("./div[@class='mTop']/audio/@src")[0], pron_url_2)
                                break
                        self.connect_file(pron_url_1, pron_url_2, pron_url)
                else:
                    # Sentence is complete: download its audio directly
                    self.download_file(pron_urls[i], pron_url)
                example.append({
                    'origin': origins[i][1:-1],
                    'sen': sen,
                    'sen_tran': sen_tran,
                    'pic_url': pic_url,
                    'pron_url': pron_url
                })
            word_info['example'] = example
        return word_info
    def main(self):
        for word in self.read_file():
            print(word)
            self.save(self.scrapy(word))

    def save(self, word_info):
        # Append each word's record to this chunk's JSON output file
        with open(self.file_name + '.json', 'a', encoding='utf-8') as f:
            if word_info:
                json.dump(word_info, fp=f, indent=4, ensure_ascii=False)
                f.write(',\n')

    def run(self):
        self.main()

if __name__ == "__main__":
    # One worker process per word chunk: ./words/1.txt ... ./words/11.txt
    for i in range(1, 12):
        p = ScrapyProcess('./words/' + str(i))
        p.start()
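The scraper as written uses no timeouts or retries, so a single flaky response can hang or crash a worker process. Below is a minimal hardening sketch using a requests.Session with urllib3's Retry; the retry counts and backoff values are illustrative choices, not part of the original project.

#!/usr/bin/env python3
# Sketch: a shared session with automatic retries; pair it with explicit timeouts.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=0.5,
                  status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

# Usage idea: replace the bare requests.get(url) calls in ScrapyProcess with
# session.get(url, timeout=10) so slow or failing requests cannot hang a worker.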
combain.py (merging the results)
#!/usr/bin/env python3
def read_file(file_name):
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            yield line

def save(file_name, data):
    with open(file_name, 'a', encoding='utf-8') as f:
        f.write(data)

# Concatenate ./words/1.json ... ./words/11.json into one output file.
for i in range(1, 12):
    print('merging file ' + str(i))
    for line in read_file('./words/' + str(i) + '.json'):
        save('單詞數據.json', line)
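Note that the merged 單詞數據.json is not valid JSON on its own: given how save() in scrapy.py writes records, it is a series of pretty-printed objects separated by ',\n' with a trailing comma. A small sketch of one way to load it back into Python objects:

#!/usr/bin/env python3
# Sketch: wrap the comma-separated records in a JSON array before parsing.
import json

with open('單詞數據.json', 'r', encoding='utf-8') as f:
    raw = f.read().rstrip().rstrip(',')  # drop the trailing ',\n'
words = json.loads('[' + raw + ']')
print(len(words), 'words loaded')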
Demo of one word's data
{
    // the word
    "word": "sir",
    // American phonetic
    "am_phonetic": "/sɝ/",
    // British phonetic
    "en_phonetic": "/sɜː/",
    // parts of speech and translations
    "tran": [
        {
            "n": "先生;(用于姓名前)爵士;閣下;(中小學生對男教師的稱呼)先生;老師"
        }
    ],
    // example sentences
    "example": [
        {
            // source of the example
            "origin": "來自《一位年輕醫生的筆記 第1季 第2集》",
            // the sentence; if it does not end with . ; ? or !, a comma is
            // appended and the follow-up sentence is joined on
            "sen": "It was me, sir and no one else, sir.",
            // sentence translation, stitched the same way
            "sen_tran": "我一個人喝掉了 醫生",
            // still image for the example
            "pic_url": "./pic/sir-0.jpg",
            // audio for the example; when the sentence was stitched, the two
            // audio clips are concatenated into this one file
            "pron_url": "./audio/sir-0.mp3"
        },
        {
            "origin": "來自《拆彈部隊》",
            "sen": "No, sir, sir, that's sergeant James. He's right here.",
            "sen_tran": "不 長官 是詹姆斯中士 他就在那里",
            "pic_url": "./pic/sir-1.jpg",
            "pron_url": "./audio/sir-1.mp3"
        },
        {
            "origin": "來自《雷斯特雷波》",
            "sen": "Sir. How you doing, sir? Good to see you again.",
            "sen_tran": "長官 還好嗎 很高興再見到您",
            "pic_url": "./pic/sir-2.jpg",
            "pron_url": "./audio/sir-2.mp3"
        },
        {
            "origin": "來自《太空堡壘卡拉狄加 第4季 第12集》",
            "sen": "Yes, sir. I'm sorry, sir, but what can I do?",
            "sen_tran": "是 長官 我很抱歉 可我能怎么辦?",
            "pic_url": "./pic/sir-3.jpg",
            "pron_url": "./audio/sir-3.mp3"
        },
        {
            "origin": "來自《太空堡壘卡拉狄加 第2季 第12集》",
            "sen": "Don't worry, sir. I'll take it real slow, sir.",
            "sen_tran": "別擔心 長官 我們會慢慢來的 長官!",
            "pic_url": "./pic/sir-4.jpg",
            "pron_url": "./audio/sir-4.mp3"
        },
        {
            "origin": "來自《耶魯大學開放課程:歐洲文明》",
            "sen": "And he replied, Sir, I pedal so quickly, they'll never catch me.",
            "sen_tran": "他回答道 先生 我踩踏板很快,他們永遠也追不上我",
            "pic_url": "./pic/sir-5.jpg",
            "pron_url": "./audio/sir-5.mp3"
        }
    ]
}
The code is rough; please bear with me. Repo: https://github.com/RickyHal/91dict_scrapy