First, the link, so you can judge for yourselves whether this is to your taste:
Industrial Control System Vulnerabilities: http://ics.cnvd.org.cn/
As you can see, the page is static HTML, which makes the problem very simple: we only need to fetch each page with requests.
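To double-check that, you can fetch the first listing page and confirm the vulnerability table is already present in the raw HTML rather than rendered by JavaScript. A quick sketch (the URL and the tbody XPath are the same ones the full script below uses; expect 20 rows per page if the page really is static):

import requests
from lxml import etree

url = 'http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=0'
headers = {'User-Agent': 'Mozilla/5.0'}
html = etree.HTML(requests.get(url, headers=headers).text)
rows = html.xpath('//tbody[@id="tr"]/tr')  # listing rows, one per vulnerability
print(len(rows))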
Without further ado, here is the code.
import requests
from urllib.parse import urlencode
from lxml import etree
import time
import xlwt
import xlrd
def makeurl():
    # listing pages look like:
    # http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=0
    baseurl = 'http://ics.cnvd.org.cn/?'
    params = {
        'tdsourcetag': 's_pctim_aiomsg',
        'max': '20'
    }
    for page in range(MAX_PAGE):  # MAX_PAGE is set in __main__
        params['offset'] = page * 20
        url = baseurl + urlencode(params)
        print('url is ', url)
        yield url
def get_page_urllist(url):
    headers = {
        'Host': 'ics.cnvd.org.cn',
        'Referer': 'http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=40',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    return response.text
def parse_urllist(content):
    # yield the detail-page link of every row in the listing table
    html = etree.HTML(content)
    for li in html.xpath('//tbody[@id="tr"]/tr'):
        yield li.xpath('td/a/@href')[0]
def get_page(url):
    headers = {
        'Host': 'www.cnvd.org.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    return response.text
def parse_page(content, url):
    html = etree.HTML(content)

    def field(label):
        # joined text of the cell that follows the <td> whose text equals `label`;
        # a label missing from the page simply yields ''
        return ''.join(i.strip() for i in html.xpath(
            '//tbody/tr/td[text()="%s"]/following-sibling::*[1]//text()' % label))

    def links(label):
        # joined href attributes inside that same following cell
        return ''.join(i.strip() for i in html.xpath(
            '//tbody/tr/td[text()="%s"]/following-sibling::*[1]//@href' % label))

    # key order must match `heads` in __main__, since save_data writes item.values()
    item = {}
    item['url'] = url
    item['標題'] = str(html.xpath('//div[@class="blkContainerSblk"]/h1/text()')[0])
    item['CNVD_ID'] = field('CNVD-ID')
    item['公開日期'] = field('公開日期')
    item['危害級別'] = ''.join(field('危害級別').split())  # drop embedded whitespace
    item['影響產品'] = field('影響產品')
    item['BUGTRAQ_ID'] = field('BUGTRAQ ID')
    item['CVE_ID'] = field('CVE ID') + ' ' + links('CVE ID')
    item['漏洞描述'] = field('漏洞描述')
    item['漏洞類型'] = field('漏洞類型')
    item['參考鏈接'] = field('參考鏈接')
    item['漏洞解決方案'] = field('漏洞解決方案')
    item['廠商補丁'] = field('廠商補丁') + ' http://www.cnvd.org.cn' + links('廠商補丁')
    item['驗證信息'] = field('驗證信息')
    item['報送時間'] = field('報送時間')
    item['收錄時間'] = field('收錄時間')
    item['更新時間'] = field('更新時間')
    item['漏洞附件'] = field('漏洞附件')
    return item
def save_data(index, item, workbook):
    sheet = workbook.get_sheet('sheet1')  # fetch the existing sheet
    for col, value in enumerate(item.values()):
        sheet.write(index, col, value)
    workbook.save(filename)  # `filename` is the global set in __main__
    print('saved')
def excel_prepare(heads):
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('sheet1', cell_overwrite_ok=True)  # create the sheet
    for col, value in enumerate(heads):
        sheet.write(0, col, value)  # header row
    return workbook
def urlisexist(url, urlset):
    # membership test against urls already saved (a set would be faster than a list)
    return url in urlset
def getallurl(filename):
    # read column 0 (the saved urls), from row 1 down to skip the header
    workbook = xlrd.open_workbook(filename)
    sheet1 = workbook.sheet_by_name('sheet1')
    results = sheet1.col_values(0, 1)
    return results
def read_old(filename):
    # load every existing row so it can be rewritten into the new workbook
    workbook = xlrd.open_workbook(filename)
    sheet1 = workbook.sheet_by_name('sheet1')
    alloldset = []
    for index in range(sheet1.nrows):
        alloldset.append(sheet1.row_values(index))
    return alloldset, sheet1.nrows
def save_old(index, olditem):
    # relies on the global `workbook` and `filename` defined in __main__
    sheet = workbook.get_sheet('sheet1')
    for col, value in enumerate(olditem):
        sheet.write(index, col, value)
    workbook.save(filename)
if __name__ == '__main__':
    TIMESLEEP = 0  # seconds to sleep between detail-page requests
    filename = '工程控制系統漏洞.xls'
    MAX_PAGE = 96  # number of listing pages to crawl
    heads = ['url',
             '標題',          # title
             'CNVD_ID',
             '公開日期',      # disclosure date
             '危害級別',      # severity
             '影響產品',      # affected products
             'BUGTRAQ_ID',
             'CVE_ID',
             '漏洞描述',      # description
             '漏洞類型',      # vulnerability type
             '參考鏈接',      # reference links
             '漏洞解決方案',  # solution
             '廠商補丁',      # vendor patch
             '驗證信息',      # verification info
             '報送時間',      # report date
             '收錄時間',      # inclusion date
             '更新時間',      # update date
             '漏洞附件']      # attachments
    try:
        # resume: load rows saved by a previous run
        alloldset, length = read_old(filename)
    except Exception:
        alloldset = []
        length = 1  # row 0 is the header, so data starts at row 1
    workbook = excel_prepare(heads)
    # copy the previously saved rows into the fresh workbook
    for index, olditem in enumerate(alloldset):
        save_old(index, olditem)
    try:
        urlset = getallurl(filename)
    except Exception:
        urlset = []
    index = length
    for urlofpage in makeurl():
        pagelistcontent = get_page_urllist(urlofpage)
        for url in parse_urllist(pagelistcontent):
            print('url is >>>', url)
            if not urlisexist(url, urlset):
                time.sleep(TIMESLEEP)
                result = get_page(url)
                item = parse_page(result, url)
                print('item is >>>', item)
                save_data(index, item, workbook)
                index = index + 1
    workbook.save(filename)
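One caveat: the script has no error handling around its network calls, so a single timeout or refusal from the server kills the whole run. Below is a minimal sketch of a retrying variant of get_page; the retry count, timeout, and delay are my own choices, not part of the original code, and it reuses the requests and time imports already at the top of the script:

def get_page_with_retry(url, retries=3, delay=5):
    # hypothetical helper, not in the original script
    headers = {
        'Host': 'www.cnvd.org.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # treat HTTP error codes as failures too
            return response.text
        except requests.RequestException as e:
            print('attempt', attempt + 1, 'failed:', e)
            time.sleep(delay)
    raise RuntimeError('giving up on ' + url)

Since save_data writes the workbook after every row, swapping this in for get_page would let a long crawl survive transient failures without losing the rows already saved.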
If anything is unclear, ask in the comments below.