
Word Segmentation and Word Frequency Statistics in Python

2020-11-26 09:13
These last couple of days I was given a task: automatically classify tens of thousands of records, which means extracting the meaningful keywords from each entry and producing a frequency count for every word. My approach is to use jieba, the Python word-segmentation tool; its usage is covered in detail in an earlier article of mine, so I won't repeat it here and will go straight to the source code:

#encoding=utf-8
import xlrd
import numpy as np
import pandas as pd
import jieba
import jieba.analyse
jieba.load_userdict('userdict.txt')
import codecs
from openpyxl import Workbook
# def insertOne(value1, value2, sheet):
#     row = [value1, value2]
#     sheet.append(row)



if __name__ == '__main__':
    # Load stop word list 1 (custom)
    stop_words = []
    try:
        with open('stop_words.txt', 'r', encoding='utf-8') as f:
            for word in f.readlines():
                w = word.strip()
                stop_words.append(w)
    except:
        print('Failed to load stop_words.txt')
    # print(stop_words)

    # Load stop word list 2 (provinces / cities / districts)
    try:
        with open('stop_words_area.txt', 'r', encoding='utf-8') as f:
            for word in f.readlines():
                w = word.strip()
                if w not in stop_words:
                    stop_words.append(w)
    except:
        print('Failed to load stop_words_area.txt')

    # Load the policy titles
    file_name = 'resource.xlsx'
    xl = xlrd.open_workbook(file_name)
    table = xl.sheets()[0]
    rows = table.row_values(0)
    cols = table.col_values(0)
    # print(rows)
    # print(cols)

    # Segment each title
    word_lst = []
    for mycol in cols:
        print(mycol)
        tags = jieba.cut(mycol.strip(), cut_all=False)
        for t in tags:
            if t not in stop_words and not t.isspace():
                print(t)
                word_lst.append(t)

    # Count the occurrences of each distinct word
    word_dict = {}
    for item in word_lst:
        if item not in word_dict:
            word_dict[item] = 1
        else:
            word_dict[item] += 1

    # print(word_lst)
    # print(word_dict)

    # Sort by frequency (descending) and print
    sorted_dict = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
    print(sorted_dict)
    # for i in range(0, len(sorted_dict)):
    #     print(sorted_dict[i])
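    # Note: the counting and sorting steps above could also be written with the
    # standard library's collections.Counter, e.g.
    #     from collections import Counter
    #     sorted_dict = Counter(word_lst).most_common()
    # which yields the same (word, count) pairs in descending order of count.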

    # Save the results (only words appearing more than 10 times)
    file = open('segment_frequency.txt', 'w', encoding='utf-8')
    for k, v in sorted_dict:
        if v > 10:
            file.write(str(k) + ' ' + str(v) + '\n')
    file.close()
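The listing imports openpyxl's Workbook and sketches an insertOne helper in the comments, but never actually writes an .xlsx file. If you also want the frequency table as a spreadsheet, a minimal sketch along those lines could look like this (the output file name results.xlsx and the header row are my own choices, not from the original):

from openpyxl import Workbook

def insertOne(value1, value2, sheet):
    row = [value1, value2]
    sheet.append(row)

wb = Workbook()
ws = wb.active
ws.append(['word', 'frequency'])   # header row
for k, v in sorted_dict:           # sorted_dict from the listing above
    insertOne(k, v, ws)
wb.save('results.xlsx')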

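The listing also imports jieba.analyse without using it. If raw frequency counts are not selective enough, jieba's built-in TF-IDF keyword extraction is one alternative; here is a minimal sketch, assuming the titles are already in cols as above (topK=50 is an arbitrary choice):

import jieba.analyse

jieba.analyse.set_stop_words('stop_words.txt')      # reuse the same stop word file
text = ' '.join(str(c) for c in cols)                # join all titles into one text
# print the top 50 keywords together with their TF-IDF weights
for word, weight in jieba.analyse.extract_tags(text, topK=50, withWeight=True):
    print(word, weight)

Note that joining everything into one text gives corpus-level keywords, which is not quite the same as the per-word counts produced by the script above, but it is often good enough for picking classification keywords.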