
Crawling Duzhe (Reader) Magazine with Python and Building a PDF


After learning a bit of BeautifulSoup, I wrote a small web crawler that scrapes Duzhe (讀者) magazine and typesets it into a PDF with reportlab.
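For reference, the six-digit issue number packs the year and issue together, and getCatalog below turns it into the magazine's URL prefix. A quick illustration of the pattern:

issue = "201501"                # issue 01 of 2015
base = "http://www.52duzhe.com/" + issue[:4] + "_" + issue[-2:] + "/"
# base == "http://www.52duzhe.com/2015_01/"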

crawler.py

The code is as follows:

#!/usr/bin/env python
#coding=utf-8
"""
    Author:         Anemone
    Filename:       getmain.py
    Last modified:  2015-02-19 16:47
    E-mail:         anemone@82flex.com
"""
import urllib2
from bs4 import BeautifulSoup
import re
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 hack so str/unicode mixing does not raise

def getEachArticle(url):
    # Download one article page and pull out its title, byline and body.
    response = urllib2.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html)
    title = soup.find("h1").string
    writer = soup.find(id="pub_date").string.strip()
    _from = soup.find(id="media_name").string.strip()
    # The readable body sits between two inline BAIDU_CLB script calls,
    # so splitting on them leaves the article text in the middle chunk.
    text = soup.get_text()
    main = re.split("BAIDU_CLB.*;", text)
    result = {"title": title, "writer": writer, "from": _from, "context": main[1]}
    return result

def getCatalog(issue):
    # e.g. issue "201501" -> http://www.52duzhe.com/2015_01/
    url = "http://www.52duzhe.com/" + issue[:4] + "_" + issue[-2:] + "/"
    firstUrl = url + "index.html"
    duzhe = dict()
    response = urllib2.urlopen(firstUrl)
    html = response.read()
    soup = BeautifulSoup(html)
    # The index page's first table link points at the real table of contents.
    firstUrl = url + soup.table.a.get("href")
    response = urllib2.urlopen(firstUrl)
    html = response.read()
    soup = BeautifulSoup(html)
    columns = soup.find_all("h2")  # one <h2> per magazine column
    for i in columns:
        print i.string
        duzhe[i.string] = list()
        for link in i.parent.find_all("a"):
            href = url + link.get("href")
            print href
            while 1:  # retry until the article downloads cleanly
                try:
                    article = getEachArticle(href)
                    break
                except:
                    continue
            duzhe[i.string].append(article)
    return duzhe

def readDuZhe(duzhe):
    # Print the table of contents of a crawled issue.
    for eachColumn in duzhe:
        for eachArticle in duzhe[eachColumn]:
            print eachArticle["title"]

if __name__ == '__main__':
#    issue = raw_input("issue(201501):")
    readDuZhe(getCatalog("201424"))
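Note that crawler.py targets Python 2: urllib2, print statements, raw_input and the reload(sys) encoding hack are all gone in Python 3. As a rough sketch, the fetch-and-parse step of getEachArticle could be ported like this (a hypothetical port, untested against the live site and assuming its 2015-era markup is unchanged):

# Python 3 sketch of getEachArticle (not from the original post).
import re
import urllib.request
from bs4 import BeautifulSoup

def get_each_article(url):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    return {
        "title": soup.find("h1").string,
        "writer": soup.find(id="pub_date").string.strip(),
        "from": soup.find(id="media_name").string.strip(),
        # Same trick as above: the body sits between two BAIDU_CLB script calls.
        "context": re.split("BAIDU_CLB.*;", soup.get_text())[1],
    }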

getpdf.py

The code is as follows:

#!/usr/bin/env python
#coding=utf-8
"""
    Author:         Anemone
    Filename:       writetopdf.py
    Last modified:  2015-02-20 19:19
    E-mail:         anemone@82flex.com
"""
import copy
import reportlab.rl_config
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib import fonts
from reportlab.platypus import Paragraph, SimpleDocTemplate, flowables
from reportlab.lib.styles import getSampleStyleSheet
import crawler

def writePDF(issue, duzhe):
    reportlab.rl_config.warnOnMissingFontGlyphs = 0
    # Register CJK fonts: SimSun for regular text, Microsoft YaHei for bold.
    pdfmetrics.registerFont(TTFont('song', "simsun.ttc"))
    pdfmetrics.registerFont(TTFont('hei', "msyh.ttc"))
    fonts.addMapping('song', 0, 0, 'song')
    fonts.addMapping('song', 0, 1, 'song')
    fonts.addMapping('song', 1, 0, 'hei')
    fonts.addMapping('song', 1, 1, 'hei')
    # Derive body, column-title, cover-title and byline styles from 'Normal'.
    stylesheet = getSampleStyleSheet()
    normalStyle = copy.deepcopy(stylesheet['Normal'])
    normalStyle.fontName = 'song'
    normalStyle.fontSize = 11
    normalStyle.leading = 11
    normalStyle.firstLineIndent = 20
    titleStyle = copy.deepcopy(stylesheet['Normal'])
    titleStyle.fontName = 'song'
    titleStyle.fontSize = 15
    titleStyle.leading = 20
    firstTitleStyle = copy.deepcopy(stylesheet['Normal'])
    firstTitleStyle.fontName = 'song'
    firstTitleStyle.fontSize = 20
    firstTitleStyle.leading = 20
    firstTitleStyle.firstLineIndent = 50
    smallStyle = copy.deepcopy(stylesheet['Normal'])
    smallStyle.fontName = 'song'
    smallStyle.fontSize = 8
    smallStyle.leading = 8
    story = []
    # Front matter: issue title plus a table of contents grouped by column.
    story.append(Paragraph(u" 讀者{0}期 ".format(issue), firstTitleStyle))
    for eachColumn in duzhe:
        story.append(Paragraph('__' * 28, titleStyle))
        story.append(Paragraph(u' {0} '.format(eachColumn), titleStyle))
        for eachArticle in duzhe[eachColumn]:
            story.append(Paragraph(eachArticle["title"], normalStyle))
    story.append(flowables.PageBreak())
    # Body: each article gets its title, byline and paragraphs, one per page.
    for eachColumn in duzhe:
        for eachArticle in duzhe[eachColumn]:
            story.append(Paragraph(u" {0} ".format(eachArticle["title"]), titleStyle))
            story.append(Paragraph(u" {0}  {1}".format(eachArticle["writer"], eachArticle["from"]), smallStyle))
            for eachPara in eachArticle["context"].split("  "):
                story.append(Paragraph(eachPara, normalStyle))
            story.append(flowables.PageBreak())
    doc = SimpleDocTemplate("duzhe" + issue + ".pdf")
    print "Writing PDF..."
    doc.build(story)

def main(issue):
    duzhe = crawler.getCatalog(issue)
    writePDF(issue, duzhe)

if __name__ == '__main__':
    issue = raw_input("Enter issue(201501):")
    main(issue)
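The four fonts.addMapping calls are what let ReportLab's inline mini-markup switch typefaces: they declare 'hei' as the bold variant of the 'song' family, so a <b> tag inside a Paragraph renders in Microsoft YaHei. A minimal standalone check (a sketch, assuming the same simsun.ttc and msyh.ttc files are on hand, as getpdf.py already requires):

# Sketch: verify the regular/bold font mapping used above.
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib import fonts
from reportlab.lib.styles import ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph

pdfmetrics.registerFont(TTFont('song', "simsun.ttc"))
pdfmetrics.registerFont(TTFont('hei', "msyh.ttc"))
fonts.addMapping('song', 0, 0, 'song')  # regular -> SimSun
fonts.addMapping('song', 1, 0, 'hei')   # bold -> Microsoft YaHei

style = ParagraphStyle('body', fontName='song', fontSize=11, leading=16)
story = [Paragraph(u"宋體正文,<b>加粗處切換為黑體</b>。", style)]
SimpleDocTemplate("font_test.pdf").build(story)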

That's all for this article; I hope you find it useful.

