Let's look at the code~
import re
import codecs

import jieba
import pandas as pd


def simplification_text(xianbingshi):
    """Extract the text enclosed between <b> and <e> markers."""
    xianbingshi_simplification = []
    with codecs.open(xianbingshi, 'r', 'utf8') as f:
        for line in f:
            line = line.strip()
            # Non-greedy match of everything between a <b> and the next <e>
            for span in re.findall(r'(?<=<b>).*?(?=<e>)', line):
                xianbingshi_simplification.append(span)
    with codecs.open(r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\code\xianbingshi_write.txt', 'w', 'utf8') as f:
        for line in xianbingshi_simplification:
            f.write(line + '\n')


def jieba_text():
    """Segment the extracted text with jieba and deduplicate the tokens."""
    word_list = []
    data = open(r"C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\xianbingshi_write.txt", encoding='utf-8').read()
    seg_list = jieba.cut(data, cut_all=False)  # accurate mode
    for i in seg_list:
        word_list.append(i.strip())
    # Deduplicate while keeping the first occurrence of each token
    data_quchong = pd.DataFrame({'a': word_list})
    data_quchong.drop_duplicates(subset=['a'], keep='first', inplace=True)
    word_list = data_quchong['a'].tolist()
    with codecs.open('word.txt', 'w', 'utf8') as w:
        for line in word_list:
            w.write(line + '\n')


def word_messy(word):
    """Refine the word list: drop pure numbers and alphanumeric noise."""
    word_sub_list = []
    with codecs.open(word, 'r', 'utf8') as f:
        for line in f:
            line_sub = re.sub(r"^[1-9]\d*\.\d*|^[A-Za-z0-9]+$|^[0-9]*$|^(-?\d+)(\.\d+)?$|^[A-Za-z0-9]{4,40}.*?", '', line)
            word_sub_list.append(line_sub)
    word_sub_list.sort()
    with codecs.open('word.txt', 'w', 'utf8') as w:
        for line in word_sub_list:
            w.write(line.strip('\n') + '\n')


if __name__ == '__main__':
    xianbingshi = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\xianbingshi_sub_sen_all(1).txt'
    # word = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\word.txt'
    simplification_text(xianbingshi)
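To see what the extraction step actually does, here is a minimal sketch; the sample line and its <b>/<e> markers are made up for illustration, since the real input file isn't shown:

import re

line = '<b>患者3天前出现咳嗽<e>伴发热<b>无咯血<e>'
print(re.findall(r'(?<=<b>).*?(?=<e>)', line))
# ['患者3天前出现咳嗽', '无咯血'] -- only spans between <b> and <e> survive

One design note: the script deduplicates with pandas drop_duplicates(keep='first') rather than set(), which removes repeats while preserving the original token order.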
Addendum: jieba word segmentation in Python, using re to strip punctuation
Again, straight to the code~
# Build the stopword table as a dict for O(1) membership tests
stopwords = {}
fstop = open('stop_words.txt', 'r', encoding='utf-8', errors='ignore')
for eachWord in fstop:
    stopwords[eachWord.strip()] = eachWord.strip()  # stopword dictionary
fstop.close()

f1 = open('all.txt', 'r', encoding='utf-8', errors='ignore')
f2 = open('allutf11.txt', 'w', encoding='utf-8')
line = f1.readline()
while line:
    line = line.strip()  # strip leading/trailing whitespace
    # Remove digits plus ASCII and full-width punctuation; the hyphen is escaped
    # so it is read literally instead of as a character range
    line = re.sub(r"[0-9\s+\.\!\/_,$%^*()?;;:\-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+", " ", line)
    seg_list = jieba.cut(line, cut_all=False)  # jieba segmentation, accurate mode
    outStr = ""
    for word in seg_list:
        if word not in stopwords:
            outStr += word
            outStr += " "
    f2.write(outStr)
    line = f1.readline()
f1.close()
f2.close()
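As a quick sanity check of the clean-then-segment-then-filter logic, here is a runnable sketch with a made-up sentence and an in-memory stopword set (the real script reads stop_words.txt instead):

import re
import jieba

stopwords = {'的', '了'}  # hypothetical stopwords, just for this demo
line = '今天的天气真好,我们去公园散步了!'
line = re.sub(r"[0-9\s+\.\!\/_,$%^*()?;;:\-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+", " ", line)
words = [w for w in jieba.cut(line, cut_all=False) if w.strip() and w not in stopwords]
print(" ".join(words))
# e.g. 今天 天气 真好 我们 去 公园 散步 (exact tokens depend on the jieba dictionary)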
The above is based on my personal experience. I hope it gives everyone a useful reference, and I hope you'll continue to support 自学编程网.