Python3处理文件中每个词的方法

2020-01-04 18:10:32

字体：大中小

来源：转载

供稿：网友

这篇文章主要介绍了Python3处理文件中每个词的方法，可实现逐个处理文件中每个词的功能，需要的朋友可以参考下。

本文实例讲述了Python3处理文件中每个词的方法。分享给大家供大家参考。具体实现方法如下：

'''

Created on Dec 21, 2012

处理文件中的每个词

@author: liury_lab

'''

import codecs

# Print every whitespace-separated word of the file, each followed by "|".
# The file is opened in plain read mode: the legacy 'U' flag is rejected by
# open() on Python 3.11+, and text mode already handles all newline styles.
# ``with`` guarantees the handle is closed even if printing raises.
with open('d:/text.txt', encoding='utf-8') as the_file:
    for line in the_file:
        for word in line.split():
            print(word, end="|")

# 若词的定义有变，可使用正则表达式

# 如词被定义为数字字母，连字符或单引号构成的序列

import re

# Same walk over the file, but a "word" is now a run of word characters,
# hyphens or apostrophes (e.g. "well-known", "it's").
# The pattern is a raw string so that \w reaches the regex engine intact.
re_word = re.compile(r"[\w'-]+")

# Plain read mode: the legacy 'U' flag is rejected by open() on Python 3.11+.
with open('d:/text.txt', encoding='utf-8') as the_file:
    print()
    print('************************************************************************')
    for line in the_file:
        for word in re_word.finditer(line):
            # group(0) is the whole matched word.
            print(word.group(0), end="|")

# 封装成迭代器

def words_of_file(file_path, line_to_words=str.split):
    """Lazily yield every word of a UTF-8 text file, in file order.

    Args:
        file_path: Path of the text file to read.
        line_to_words: Callable that receives one line (``str``) and returns
            an iterable of the words in it. Defaults to ``str.split``, i.e.
            words are whitespace-separated.

    Yields:
        The words produced by ``line_to_words`` for each successive line.

    Raises:
        OSError: If ``file_path`` cannot be opened.
    """
    # Open the path the caller asked for (not a hard-coded one). ``with``
    # closes the file even when the consumer stops iterating early and the
    # generator is closed or garbage-collected.
    with open(file_path, encoding='utf-8') as the_file:
        for line in the_file:
            for word in line_to_words(line):
                yield word

# Demo: the same "|"-separated output, now produced through the generator.
print()
print('************************************************************************')
for word in words_of_file('d:/text.txt'):
    print(word, end='|')

def words_by_re(file_path, repattern=r"[\w'-]+"):
    """Yield the words of a file, where a word is any match of a regex.

    Args:
        file_path: Path of the text file to read.
        repattern: Regular expression describing one word. The default
            matches runs of word characters, hyphens and apostrophes.

    Returns:
        The generator returned by ``words_of_file``, yielding each whole
        match (``group(0)``) line by line.

    Raises:
        re.error: If ``repattern`` is not a valid regular expression.
    """
    # Compile the caller's pattern once, outside the per-line helper.
    re_word = re.compile(repattern)

    def line_to_words(line):
        # Must be a generator: a plain ``return`` here would hand back only
        # the first match of each line instead of all of them.
        for mo in re_word.finditer(line):
            yield mo.group(0)

    # File handling is delegated entirely to words_of_file, so no handle is
    # opened (or leaked) here.
    return words_of_file(file_path, line_to_words)

# Demo: print the regex-defined words of the file, each followed by "|".
print()
print('************************************************************************')
for word in words_by_re('d:/text.txt'):
    print(word, end='|')