Change the username and password and the script can be used as-is (the account needs a VIP subscription to fetch VIP chapters). The code is as follows:
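The flow is: log in at passport.17k.com to get an authenticated session, look the title up on search.17k.com, collect every chapter link and its VIP flag from the book's chapter list page, then pull free chapters from the normal reading page and VIP chapters through the site's JSON chapter API. To run it, set self.loginName and self.password in __init__, start the script, and type the exact novel title when prompted; chapters are appended to <novel name>.txt in the working directory.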
# -*- coding: utf-8 -*-
# @Time : 2019/5/19 17:53
# @Author : LM
import requests
from bs4 import BeautifulSoup
import json
import os


class Spider(object):
    def __init__(self):
        self.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
        print('Enter the name of the novel to crawl')
        self.novel_name = input()
        self.chapters_url = ''
        self.download_url_ls = {}  # chapter href -> VIP flag
        self.loginName = ''
        self.password = ''

    def login(self):
        data = {
            'loginName': self.loginName,
            'password': self.password
        }
        login_url = 'https://passport.17k.com/ck/user/login'
        session = requests.Session()
        html = session.post(url=login_url, data=data, headers=self.headers).text
        json_data = json.loads(html)
        if json_data['status']['msg'] == 'succ':
            print('Login succeeded, current account: {}'.format(self.loginName))
            return session
        print('Wrong username or password')
        return None

    def catalogue(self):
        url = 'https://search.17k.com/search.xhtml?c.st=0&c.q=' + self.novel_name
        html = requests.get(url=url, headers=self.headers).text
        soup = BeautifulSoup(html, 'lxml')
        res = soup.find_all(attrs={'class': 'textmiddle'})
        for i in res:
            info = i.select('dl > dt > a')[0]
            searched_name = info.get_text().strip()
            if searched_name == self.novel_name:
                print('{}: found'.format(self.novel_name))
                # the search result links to the book page; the chapter list
                # lives at the same path with 'book' replaced by 'list'
                self.chapters_url = 'https://' + info.get('href')[2:].replace('book', 'list')
                break
        else:  # no break: nothing matched the exact title
            print('Search failed: no such novel, or the title is misspelled')

    def get_download_url(self):
        html = requests.get(url=self.chapters_url, headers=self.headers)
        html.encoding = 'utf-8'
        soup = BeautifulSoup(html.text, 'lxml')
        volume = soup.find_all('dl', class_='Volume')
        for c in volume:
            chapters_res = BeautifulSoup(str(c), 'lxml')
            all_chapters = chapters_res.find_all('a', target='_blank')
            for each_chapter in all_chapters:
                download_url = each_chapter.get('href').strip()
                # the second CSS class of the chapter's <span> is the VIP
                # marker; it is an empty string for free chapters
                is_vip = each_chapter.find_all('span')[0].get('class')[1]
                self.download_url_ls[download_url] = is_vip

    def download_content(self):
        session = self.login()
        if os.path.exists('./{}.txt'.format(self.novel_name)):
            os.remove('./{}.txt'.format(self.novel_name))
            print('The novel file already exists; deleted it')
        print('Start crawling novel: {}'.format(self.novel_name))
        for u in self.download_url_ls.keys():
            if self.download_url_ls[u] == '':
                # free chapter: scrape the regular reading page
                url = 'https://www.17k.com' + u
                html = requests.get(url=url, headers=self.headers)
                html.encoding = 'utf-8'
                soup = BeautifulSoup(html.text, 'lxml')
                read_area = soup.find_all('div', class_='readAreaBox content')[0]
                title = read_area.select('h1')[0].get_text()
                print('Crawling chapter: {}'.format(title))
                content = ''
                for c in read_area.select('p'):
                    content += ' ' + c.get_text() + '\n'
            else:
                # VIP chapter: requires the logged-in session
                if session is None:
                    continue  # login failed, skip paid chapters
                # the href looks like /chapter/<book_id>/<chapter_id>.html;
                # see the worked split example after this listing
                book_id = u.split('/')[2]
                chapter_id = u.split('/')[3].split('.')[0]
                url = 'https://www.17k.com/ck/book/{}/chapter/{}?subAllPrice=1&appKey=2406394919'.format(book_id, chapter_id)
                html = session.get(url=url, headers=self.headers).text
                json_data = json.loads(html)
                title = json_data['data']['name']
                print('Crawling chapter: {}'.format(title))
                content = ' ' + json_data['data']['content'][0]['text'].replace('\r', '')
            with open('./{}.txt'.format(self.novel_name), 'a', encoding='utf-8') as f:
                f.write(title + '\n\n')
                f.write(content)
                f.write('\n\n')
        print('Crawling finished.')


if __name__ == "__main__":
    s = Spider()
    s.catalogue()
    s.get_download_url()
    s.download_content()
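
The VIP branch depends on chapter hrefs having the shape /chapter/<book_id>/<chapter_id>.html, which is what the u.split('/') indexing assumes. A worked example with a made-up href:

u = '/chapter/12345/67890.html'      # hypothetical href, for illustration only
parts = u.split('/')                 # ['', 'chapter', '12345', '67890.html']
book_id = parts[2]                   # '12345'
chapter_id = parts[3].split('.')[0]  # '67890'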
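
The script also fires one request per chapter with no delay or error handling, which is fragile and unfriendly to the site. A minimal sketch of a throttled, retrying fetch that could replace the bare requests.get calls (the delay and retry counts here are arbitrary choices, not values from the original):

import time
import requests

def polite_get(url, headers, retries=3, delay=1.0):
    """GET with a fixed pause between attempts; returns None if all retries fail."""
    for attempt in range(retries):
        try:
            resp = requests.get(url=url, headers=headers, timeout=10)
            resp.raise_for_status()
            return resp
        except requests.RequestException as e:
            print('Request failed ({}), attempt {}/{}'.format(e, attempt + 1, retries))
            time.sleep(delay)
    return None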