Jtyoui / snsg

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
This repository has not declared an open-source license file (LICENSE); before use, check the project description and the code's upstream dependencies.
neology.py 6.03 KB
tyoui committed on 2019-01-20 01:57: add threading, optimize code
#!/usr/bin/python3
# -*- coding=utf-8 -*-
"""Automatically train on raw text (unsupervised new-word discovery).
@author: 张伟
@time: 2018/2/28 10:58
"""
import math
import re
from threading import Thread, Lock

import queue as queues

# candidate word -> [count, frequency, cohesion, freedom, left neighbors, right neighbors]
ALL_WORDS = dict()
ALL_LENS = 0   # total number of characters read
LOCK = Lock()  # guards ALL_LENS updates across worker threads


class Neologism(Thread):
    """Worker thread: drains lines from the shared queue and mines word candidates."""

    def __init__(self, queue, split_num=4):
        Thread.__init__(self)
        self.queue = queue
        self.split_num = split_num

    def run(self):
        while True:
            try:
                line = self.queue.get_nowait()
                self.read_string(line)
                self.queue.task_done()
            except queues.Empty:
                # queue drained: this worker is done
                return

    def read_string(self, st, split_seq='[,。!?:]'):
        """Split the string on the split_seq pattern.
        :param st: input string
        :param split_seq: regex character class of delimiters
        :return: None
        """
        ls = re.split(split_seq, st)
        self.read_ls(ls=ls)
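
    # Illustration (not from the original): with the default pattern,
    # re.split('[,。!?:]', '今天天气不错,出去走走。') gives
    # ['今天天气不错', '出去走走', ''] — the trailing empty fragment is
    # harmless because clean() rejects anything under three CJK characters.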

    def read_ls(self, ls):
        """Process a list of sentence fragments.
        :param ls: list of str
        """
        global ALL_LENS
        for word in ls:
            with LOCK:  # += on a module global is not atomic across threads
                ALL_LENS += len(word)
            clean_data = clean(data=word)
            if clean_data:
                self.split(clean_data)

    def split(self, words):
        """Split the fragment into candidate n-grams of at most split_num characters.
        Each ALL_WORDS entry has the form
        [count, frequency, cohesion, freedom, left-neighbor chars, right-neighbor chars]
        (the neighbor lists feed the information-entropy estimate).
        """
        lens = len(words)
        for i in range(lens):
            for j in range(1, self.split_num + 1):
                if i + j >= lens:  # the final character has no right neighbor
                    break
                key = words[i:i + j]
                # words[i - 1] would wrap around to the last char at i == 0
                left = words[i - 1] if i > 0 else ''
                right = words[i + j]
                entry = ALL_WORDS.get(key)
                if entry:
                    entry[0] += 1
                    if left:
                        entry[4].append(left)
                    entry[5].append(right)
                else:
                    ALL_WORDS[key] = [1, 0.0, 1, 0, [left] if left else [], [right]]
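
    # Worked example (illustrative): with split_num=2, split('abcde') produces
    # the keys a, ab, b, bc, c, cd, d — 'e' never becomes a key because every
    # candidate needs a right neighbor. 'bc' records left neighbor 'a' and
    # right neighbor 'd'.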


def statistics(key_list):
    """Convert raw counts into frequencies: frequency = count / total characters."""
    for key in key_list:
        ALL_WORDS[key][1] = ALL_WORDS[key][0] / ALL_LENS
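
# Sketch with assumed numbers: if '电影' is counted 40 times in a corpus of
# 100000 characters, statistics() sets its frequency to 40 / 100000 = 0.0004.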


def handle(key_list):
    """Compute cohesion and freedom for every candidate.
    Freedom measures how random the left/right neighbor sets are and keeps the
    smaller of the left-neighbor and right-neighbor entropies; cohesion compares
    the word's frequency against the product of its two parts' frequencies.
    """
    for key in key_list:
        key_list_words = ALL_WORDS[key]
        if len(key) == 1:
            continue
        end_all = front_all = 0.0
        left = key_list_words[1] / (ALL_WORDS[key[0]][1] * ALL_WORDS[key[1:]][1])
        right = key_list_words[1] / (ALL_WORDS[key[-1]][1] * ALL_WORDS[key[:-1]][1])
        for front in key_list_words[4]:
            if ALL_WORDS.get(front):
                front_all -= math.log(ALL_WORDS[front][1]) * ALL_WORDS[front][1]
        for end in key_list_words[5]:
            if ALL_WORDS.get(end):
                end_all -= math.log(ALL_WORDS[end][1]) * ALL_WORDS[end][1]
        key_list_words[2] = left if left < right else right
        key_list_words[3] = front_all if front_all < end_all else end_all
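
# The measures above, written out (this mirrors the code, not an external spec):
#   cohesion(w) = min( P(w) / (P(w[0])  * P(w[1:])),
#                      P(w) / (P(w[:-1]) * P(w[-1])) )   # low => w splits apart easily
#   freedom(w)  = min( -sum P(c)*log P(c) over recorded left neighbors c,
#                      -sum P(c)*log P(c) over recorded right neighbors c )
# where P(.) is the corpus-wide frequency from statistics(); the sums run over
# the neighbor lists with repetition — the script's own variant of boundary entropy.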


def filter_words(frequency, cond, free, flag):
    """Filter out unimportant candidates.
    Kept values have the form [count, frequency, cohesion, freedom].
    :param frequency: frequency threshold
    :param cond: cohesion threshold
    :param free: freedom threshold
    :param flag: True = AND (must clear every threshold),
                 False = OR (clearing any one threshold is enough)
    :return: dict of surviving candidates
    """
    key_words = dict()
    for key in ALL_WORDS.keys():
        if len(key) <= 1:
            continue
        one_word = ALL_WORDS[key]
        if flag:
            if one_word[1] > frequency and one_word[2] > cond and one_word[3] > free:
                key_words[key] = one_word[:4]
        else:
            if one_word[1] > frequency or one_word[2] > cond or one_word[3] > free:
                key_words[key] = one_word[:4]
    return key_words
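
# Usage sketch (thresholds are hypothetical): filter_words(0.0001, 10, 0.1, True)
# keeps only candidates with frequency > 0.0001 AND cohesion > 10 AND
# freedom > 0.1; with flag=False, clearing any single threshold is enough.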


def read_file(file, file_encoding='utf-8'):
    """Read the file into a queue, one line per item.
    Note: the file is expected to be UTF-8 without a BOM.
    :param file: file to read
    :param file_encoding: text encoding
    """
    queue = queues.Queue(maxsize=0)
    with open(file, encoding=file_encoding) as fp:
        for line in fp:
            queue.put(line)
    return queue


def clean(data):
    """Strip everything but CJK characters."""
    # inclusive bounds: 19968 (一, U+4E00) through 40959 (U+9FFF);
    # the original's strict `<` wrongly dropped the very common 一
    words = [char for char in data if 19968 <= ord(char) <= 40959]
    if len(words) > 2:
        return ''.join(words)
    return None
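
# Illustration (not from the original): clean('2019年春节abc') returns '年春节',
# while clean('你好') returns None — only two CJK characters survive and at
# least three are required.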


def analysis(file, thread_num=10, split_num=4, frequency=0.0001, cond=10, free=0.1, flag=False):
    """
    :param file: text file to train on
    :param thread_num: number of worker threads
    :param split_num: maximum candidate length in characters
    :param frequency: frequency threshold
    :param cond: cohesion threshold
    :param free: freedom threshold
    :param flag: True = AND filtering, False = OR (default; clearing one threshold suffices)
    :return: dict of analysed candidates
    """
    queue = read_file(file)
    neologisms = [Neologism(split_num=split_num, queue=queue) for _ in range(thread_num)]
    for neologism in neologisms:
        neologism.start()
    queue.join()
    print("Computing frequencies.........")
    keys_list = list(ALL_WORDS.keys())
    size = len(keys_list) // split_num + 1
    threads = []
    for i in range(split_num):
        t = Thread(target=statistics, args=(keys_list[i * size:(i + 1) * size],))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
    print("Processing data.........")
    threads = []  # start fresh; don't re-join the finished statistics threads
    for i in range(split_num):
        t = Thread(target=handle, args=(keys_list[i * size:(i + 1) * size],))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
    print("Filtering data.........")
    neologism_word = filter_words(frequency, cond, free, flag)
    print("Analysis done!..........")
    return neologism_word


if __name__ == '__main__':
    neologism_words = analysis(file='小时代.txt', thread_num=10, frequency=0.00001, split_num=8, cond=100, flag=True)
    for k, v in neologism_words.items():
        print('key:{0} count:{1} frequency:{2} cond:{3} free:{4}'.format(k, v[0], v[1], v[2], v[3]))