"""python提取文本的tfidf特征"""
import math
from collections import Counter
# 1. Corpus: one string per document.
corpus = [
    'this is the first document',
    'this is the second second document',
    'and the third one',
    'is this the first document',
]

# 2. Tokenize every document on whitespace.
# split() with no argument collapses runs of whitespace, so an accidental
# double space does not produce an empty-string "word" that would later be
# counted as a term.
word_list = [document.split() for document in corpus]
print('2-->', word_list)

# 3. Count how often each word occurs inside each document.
# countlist[i] is the Counter for corpus[i].
countlist = [Counter(words) for words in word_list]
print('3詞頻-->', countlist)
# 4. Functions implementing the tf-idf formula.
def tf(word, count):
    """Return the term frequency of *word* within a single document.

    Args:
        word: The term to look up.
        count: Mapping of word -> number of occurrences for one document
            (a ``Counter`` or a plain ``dict``).

    Returns:
        Occurrences of *word* divided by the total number of words in the
        document. ``0.0`` if the document is empty; ``0`` frequency if the
        word does not occur.
    """
    total_words = sum(count.values())
    if not total_words:
        # An empty document has no term frequencies; avoid dividing by zero.
        return 0.0
    # .get() instead of [] so a plain dict without the word yields 0
    # rather than raising KeyError (Counter already behaves this way).
    return count.get(word, 0) / total_words
def n_containing(word, count_list):
    """Return how many documents contain *word*.

    Args:
        word: The term to look for.
        count_list: Iterable of per-document containers (e.g. ``Counter``
            objects); a document counts when ``word in document`` is true.

    Returns:
        The number of documents in *count_list* that contain *word*
        (``0`` for an empty *count_list*).
    """
    return sum(1 for count in count_list if word in count)
def idf(word, count_list):
    """Return the inverse document frequency of *word* over a corpus.

    Computed as ``log(N / (1 + df))`` where ``N`` is ``len(count_list)``
    (the number of documents) and ``df`` is the number of documents that
    contain *word*. The ``+ 1`` keeps the denominator from being zero for
    words that occur in no document.

    Note that with this formula a word present in every document gets a
    slightly negative value (``log(N / (N + 1))``), and a word present in
    all but one document gets exactly ``0``.

    Args:
        word: The term to score.
        count_list: Sequence of per-document word-count mappings.

    Returns:
        The natural-log idf value, or ``0.0`` for an empty corpus.
    """
    total_docs = len(count_list)
    if not total_docs:
        # log(0) is undefined (math.log raises ValueError); an empty corpus
        # carries no information, so report a neutral weight instead.
        return 0.0
    return math.log(total_docs / (1 + n_containing(word, count_list)))
def tfidf(word, count, count_list):
    """Return the tf-idf weight of *word* for one document.

    Args:
        word: The term to score.
        count: Word-count mapping of the document being scored.
        count_list: Per-document word-count mappings for the whole corpus.

    Returns:
        ``tf(word, count) * idf(word, count_list)``.
    """
    return tf(word, count) * idf(word, count_list)
# 5. Merge the per-document counts into corpus-wide totals.
all_dict = {}
for doc_counts in countlist:
    for word, freq in doc_counts.items():
        # .get() supplies 0 the first time a word is seen, replacing the
        # former bare ``except:`` that would also have hidden real errors.
        all_dict[word] = all_dict.get(word, 0) + freq
print('merge-->', all_dict)

# 6. Write one "<word> <value>" line per vocabulary word.
# tf.txt receives the raw corpus-wide occurrence count (not a normalised
# frequency); idf.txt receives the inverse document frequency.
# The files are only written, so plain 'w' suffices; the explicit encoding
# keeps the output independent of the platform's locale default.
with open('tf.txt', 'w', encoding='utf-8') as tfin, \
        open('idf.txt', 'w', encoding='utf-8') as idfin:
    for word, total in all_dict.items():
        tfin.write('{} {}\n'.format(word, total))
        idfin.write('{} {}\n'.format(word, idf(word, countlist)))
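# ---------------------------------------------------------------------------
# Blog footer carried over from the original web page (not part of the script;
# kept as comments so the file remains valid Python).
#
# More articles, technical exchange, business cooperation: contact the blogger.
# WeChat: scan the QR code or search for z360901061
#
# Scan with WeChat to add me as a friend.
# QQ contact: 360901061
# Your support is the blogger's greatest motivation to write. If you like my
# articles and find them helpful, please scan the QR code below with WeChat to
# support the blogger with 2, 5, 10, 20 yuan or any amount you wish to donate.
# The site owner is very grateful! If long-pressing in mobile WeChat cannot
# complete the payment: save the WeChat payment QR code to your photo album,
# switch to WeChat, tap "Scan" in the top-right corner, and select the saved
# QR code to complete the payment.
# [As long as this article helped you] yuan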
