当前位置：首页 > news >正文

13.多行文本读取、遍历

news 2026/4/28 23:23:04

# -*- coding: utf-8 -*-
"""Day 13: reading and iterating over multi-line text files.

Created on: 2026/4/28 9:50
Creator: er_nao
File: Day_13.py

The script is a flat, top-to-bottom demo in two parts:

1. Four ways to read multi-line text (``readlines``, line-by-line
   iteration, ``read`` + ``splitlines``, splitting on a custom separator).
2. Two ways to process the lines afterwards (per-line cleaning, and
   batch tokenisation with a word-frequency count).

All input paths are hard-coded; the files must exist for the demo to run.
"""
import re
from collections import Counter

import jieba

# Input files used by the demos below.
WORDS_LIST_PATH = 'C:\\Users\\hp\\Desktop\\words_list.txt'
NLP_RESULT_PATH = 'C:\\Users\\hp\\Desktop\\nlp_result.txt'
TEST_PATH = 'C:\\Users\\hp\\Desktop\\test.txt'

# Anything that is NOT a CJK ideograph, ASCII letter/digit, whitespace or one
# of the listed full-width punctuation marks is removed during cleaning.
_DISALLOWED_CHARS = re.compile(r'[^\u4e00-\u9fa5a-zA-Z0-9\s，。！？；：]')
# Runs of any whitespace (used for single lines, where no newline must survive).
_WHITESPACE_RUN = re.compile(r'\s+')
# Runs of whitespace EXCLUDING line breaks, so line structure is preserved.
_HORIZONTAL_WHITESPACE_RUN = re.compile(r'[^\S\r\n]+')


# ============================================================================
# Key topic 1: four core ways to read multi-line text
# ============================================================================

# ----------------------------------------------------------------------------
# Method 1: readlines() -- read every line at once.
# Suited to small/medium files where all lines are post-processed together
# (batch filtering, statistics); the whole file is held in memory.
# ----------------------------------------------------------------------------
with open(WORDS_LIST_PATH, 'r', encoding='utf-8') as file:
    all_lines = file.readlines()

# Strip the trailing newline / surrounding whitespace and drop blank lines.
valid_lines = [stripped for stripped in (line.strip() for line in all_lines) if stripped]

print(f'【读取结果】：总共有{len(all_lines)}行，有效行{len(valid_lines)}行')
print('【有效对话内容】：')
for i, line in enumerate(valid_lines):
    print(f'第{i+1}行：{line}')

# ----------------------------------------------------------------------------
# Method 2: iterate the file object line by line.
# Suited to very large files: only one line is in memory at a time, and only
# the lines that match a condition (here: containing "NLP") are kept.
# ----------------------------------------------------------------------------
target_lines = []
with open(WORDS_LIST_PATH, 'r', encoding='utf-8') as file:
    for line in file:
        clean_line = line.strip()
        if 'NLP' in clean_line:
            target_lines.append(clean_line)

print(f'【提取结果】：共找到{len(target_lines)}条包含NLP的内容')
for line in target_lines:
    print(line)

# ----------------------------------------------------------------------------
# Method 3: read() + splitlines() -- clean the whole text, then split.
# Suited to cases where the text needs one global preprocessing pass first;
# splitlines() handles both "\n" and "\r\n" line endings.
# ----------------------------------------------------------------------------
with open(WORDS_LIST_PATH, 'r', encoding='utf-8') as file:
    full_text = file.read()

clean_full_text = _DISALLOWED_CHARS.sub('', full_text)
# Collapse only horizontal whitespace here. Collapsing r'\s+' would also turn
# every newline into a space, leaving splitlines() with a single line.
clean_full_text = _HORIZONTAL_WHITESPACE_RUN.sub(' ', clean_full_text).strip()

lines_list = clean_full_text.splitlines()
valid_lines = [stripped for stripped in (line.strip() for line in lines_list) if stripped]

for line in valid_lines:
    print(line)

# ----------------------------------------------------------------------------
# Method 4: split on a custom separator.
# Suited to structured documents split into paragraphs by blank lines, or
# dialogue logs split by a marker.
# ----------------------------------------------------------------------------
with open(NLP_RESULT_PATH, 'r', encoding='utf-8') as file:
    full_text_4 = file.read()

# Two consecutive newlines (a blank line) separate paragraphs.
paragraph = full_text_4.split('\n\n')

# Join each paragraph's lines with a space and drop empty paragraphs.
valid_paragraph = []
for par in paragraph:
    clean_par = par.strip().replace('\n', ' ')
    if clean_par:
        valid_paragraph.append(clean_par)

print(f'【文档读取结果】：一共拆分{len(valid_paragraph)}个段落')
for i, para in enumerate(valid_paragraph):
    # Show a 3-character preview for longer paragraphs, the full text otherwise.
    print(f"\n第{i+1}段：{para[:3]}..." if len(para) > 3 else f"\n第{i+1}段：{para}")


# ============================================================================
# Key topic 2: iterating over and processing multi-line text
# ============================================================================

# ----------------------------------------------------------------------------
# Approach 1: basic iteration with per-line cleaning.
# ----------------------------------------------------------------------------
def clean_line_text(line: str) -> str:
    """Clean a single line of text.

    Removes every character that is not a CJK ideograph, ASCII letter or
    digit, whitespace, or one of ``，。！？；：``; collapses each whitespace
    run to a single space; and strips leading/trailing whitespace.

    Args:
        line: The raw line, with or without its trailing newline.

    Returns:
        The cleaned line. It is the empty string when nothing but removed
        characters and whitespace was present, so callers can filter on it.
    """
    clean_line = _DISALLOWED_CHARS.sub('', line)
    clean_line = _WHITESPACE_RUN.sub(' ', clean_line)
    # Strip last: removing symbols can expose new leading/trailing spaces
    # (e.g. "# #" -> " "), which would otherwise survive the blank filter.
    return clean_line.strip()


with open(TEST_PATH, 'r', encoding='utf-8') as file:
    all_lines = file.readlines()

# Clean every line and keep only the non-empty results.
clean_lines = [cleaned for cleaned in (clean_line_text(line) for line in all_lines) if cleaned]

print('【逐行清洗后的结果】')
for line in clean_lines:
    print(line)

# ----------------------------------------------------------------------------
# Approach 2: batch iteration with tokenisation and a frequency count.
# ----------------------------------------------------------------------------
# A set gives O(1) membership tests in the per-token filter below.
STOP_WORDS = {
    "的", "是", "在", "和", "有", "也", "就", "都", "而", "及", "与", "着",
    "，", "。", "！", "？", "；", "：", "、", " ",
}

with open(TEST_PATH, 'r', encoding='utf-8') as file:
    all_lines = file.readlines()

clean_lines = [stripped for stripped in (line.strip() for line in all_lines) if stripped]

# Tokenise each line with jieba and drop stop words / whitespace-only tokens.
all_words = []
for line in clean_lines:
    words = jieba.lcut(line)
    all_words.extend(
        word for word in words
        if word not in STOP_WORDS and word.strip() != ''
    )

# Counter replaces the manual dict tally; most_common() returns (word, count)
# pairs in descending count order, ties in first-seen order.
word_count = Counter(all_words)
sorted_word_count = word_count.most_common()

print(f'【所有行的分词结果】：{all_words}')
print('【高频词出现次数】：')
for word, count in sorted_word_count[:10]:
    print(f'{word},出现{count}次')