当前位置：首页 > news >正文

Python io 字节流与文本流

news 2026/7/25 19:05:22

Python io 字节流与文本流
===============================

io 模块提供流式 I/O 的核心抽象: 文本流、二进制流和原始流。

1. StringIO — 内存文本流
----------------------------

from io import StringIO

# StringIO exposes an in-memory str buffer through the text-file API
# (all reads and writes stay in memory).
buffer = StringIO()
buffer.write("第一行\n")
buffer.write("第二行\n")
buffer.write("第三行\n")

# getvalue() returns the entire buffer regardless of the current position.
content = buffer.getvalue()
print("StringIO 内容:\n", content)

# seek/readline behave exactly as they do on a real file.
buffer.seek(0)  # rewind to the start
print("readline:", buffer.readline().strip())  # 第一行
print("readline:", buffer.readline().strip())  # 第二行

# Iterating the stream yields one line at a time.
buffer.seek(0)
for line in buffer:
    print(" 遍历:", line.strip())

# A StringIO built from initial data can be handed to any API that expects
# a text file, e.g. csv.reader.
import csv

csv_data = "name,age\nAlice,25\nBob,30\n"
# The with statement closes the CSV buffer once parsing is finished.
with StringIO(csv_data) as f:
    reader = csv.reader(f)
    for row in reader:
        print("CSV 行:", row)

buffer.close()  # release the buffer's memory

# Best practice: a with statement guarantees the buffer is released.
with StringIO() as buf:
    buf.write("临时数据")
    data = buf.getvalue()  # must be read before the buffer is closed
    print("with 语句获取:", data)

2. BytesIO — 内存二进制流
-----------------------------

from io import BytesIO

# BytesIO is the binary counterpart of StringIO: an in-memory buffer for
# bytes (images, archives, serialized payloads, ...).
buf = BytesIO()
for piece in (b"\x00\x01\x02", bytes([3, 4, 5])):
    buf.write(piece)  # successive writes append at the current position
print("BytesIO 全部数据:", buf.getvalue())  # b'\x00\x01\x02\x03\x04\x05'

# 应用: 模拟文件上传 / 网络传输
def process_upload(data: bytes) -> BytesIO:
    """Wrap uploaded bytes in a stream, print its header and size, and rewind.

    Args:
        data: Raw payload of the simulated upload.

    Returns:
        A ``BytesIO`` over ``data`` positioned at offset 0.
    """
    from io import SEEK_END  # local import: only BytesIO is imported at file level

    stream = BytesIO(data)
    header = stream.read(4)  # up to the first 4 bytes; fewer if data is shorter
    print("文件头:", header.hex())  # shown as hexadecimal
    stream.seek(0, SEEK_END)  # jump to the end so tell() reports the total size
    print("总大小:", stream.tell(), "字节")
    stream.seek(0)  # rewind so the caller reads from the beginning
    return stream

# Parse a small binary record by reading fixed-size fields from a BytesIO.
import struct

# Network byte order: 4-byte unsigned int, 2 raw bytes, 2-byte unsigned short.
binary_data = struct.pack("!I 2s H", 1, b"AB", 256)
buf = BytesIO(binary_data)
# struct.unpack always returns a tuple; single-target unpacking extracts
# the one value each format string produces.
(field1,) = struct.unpack("!I", buf.read(4))
(field2,) = struct.unpack("!2s", buf.read(2))
(field3,) = struct.unpack("!H", buf.read(2))
print(f"BytesIO 解析: {field1}, {field2}, {field3}")  # 1, b'AB', 256

3. IO 类层次结构
--------------------

from io import IOBase, RawIOBase, BufferedIOBase, TextIOBase

# IO 类的继承关系:
# IOBase (抽象基类)
# ├── RawIOBase (原始二进制流: readinto, read, write)
# ├── BufferedIOBase (带缓冲的二进制流: BufferedWriter, BufferedReader, BufferedRWPair)
# └── TextIOBase (文本流: read, write, seek)

# Open a real file object so there is a concrete stream to inspect.
# NOTE: mode "w" does create (or truncate) example.txt in the current
# working directory; the handle is closed when the with block exits.
with open("example.txt", "w") as f:
    pass

# 使用实际文件流演示
from io import FileIO, BufferedReader, TextIOWrapper

# FileIO: 原始二进制文件流
# BufferedReader: 带缓冲的二进制读取 (默认 buffer 大小 8192)
# TextIOWrapper: 文本层包装, 处理编码

# 层次关系: TextIOWrapper -> BufferedReader -> FileIO
# open() 的默认行为:
# 文本模式: open("f.txt", "r") = TextIOWrapper(BufferedReader(FileIO("f.txt")))
# 二进制模式: open("f.txt", "rb") = BufferedReader(FileIO("f.txt"))

# Print the conceptual layering that open() builds for text and binary modes.
for diagram_line in (
    "IOBase 子类关系图 (概念):",
    " open('r') → TextIOWrapper → BufferedReader → FileIO",
    " open('rb') → BufferedReader → FileIO",
):
    print(diagram_line)

4. TextIOWrapper — 编码处理
-------------------------------

from io import TextIOWrapper, BytesIO

# Decoding: wrap a byte stream and let TextIOWrapper apply the codec.
raw_bytes = b"\xe4\xb8\xad\xe6\x96\x87"  # UTF-8 encoding of "中文"
byte_stream = BytesIO(raw_bytes)
text_stream = TextIOWrapper(byte_stream, encoding="utf-8")
content = text_stream.read()
print("TextIOWrapper 解码:", content)  # 中文

# The same text stored as GBK only needs a different encoding argument.
gbk_bytes = b"\xd6\xd0\xce\xc4"  # GBK encoding of "中文"
gbk_stream = BytesIO(gbk_bytes)
gbk_text = TextIOWrapper(gbk_stream, encoding="gbk")
gbk_decoded = gbk_text.read()
print("GBK 解码:", gbk_decoded)  # 中文

# Encoding: text written to the wrapper reaches the byte stream as UTF-8.
out_stream = BytesIO()
text_out = TextIOWrapper(out_stream, encoding="utf-8")
text_out.write("你好, 世界!")
text_out.flush()  # push the wrapper's pending text down into the BytesIO
encoded = out_stream.getvalue()
print("UTF-8 编码结果:", encoded)  # b'\xe4\xbd\xa0\xe5\xa5\xbd...'

# Closing a TextIOWrapper also closes the stream underneath it; detach()
# hands the underlying stream back and leaves the wrapper unusable.
raw = BytesIO(b"test")
tw = TextIOWrapper(raw, encoding="utf-8")
detached = tw.detach()
print("detach 后底层流:", detached.read())  # b'test'

5. 标准流重定向: 使用 StringIO 模拟标准输入/输出
------------------------------------------

# 场景: 用 StringIO 模拟标准输入/输出重定向
import sys
from io import StringIO

def capture_output(func, *args, **kwargs):
    """Run ``func(*args, **kwargs)`` and return what it wrote to stdout.

    Args:
        func: Callable to execute while stdout is redirected.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        The text written to ``sys.stdout`` during the call.

    Raises:
        Whatever ``func`` raises; ``sys.stdout`` is restored first.
    """
    from contextlib import redirect_stdout  # local import: not imported at file level

    # redirect_stdout restores sys.stdout on exit, even on error. Keeping our
    # own handle on the buffer means the result does not depend on what
    # sys.stdout points to after func has run.
    with redirect_stdout(StringIO()) as captured_stream:
        func(*args, **kwargs)
    return captured_stream.getvalue()


def hello():
    """Print two fixed lines; serves as the target of the capture demo."""
    print("Hello, World!")
    print("这是被捕获的输出")


captured = capture_output(hello)
print("捕获的内容:", repr(captured))

# Simulate stdin by temporarily replacing sys.stdin with a StringIO.
def simulate_input(data: str) -> str:
    """Feed ``data`` to ``input()`` as if a user had typed it.

    Args:
        data: Text to expose on stdin; ``input()`` consumes its first line.

    Returns:
        The first line of ``data`` without its trailing newline.

    Raises:
        EOFError: If ``data`` is empty, because ``input()`` hits end of file.
    """
    original_stdin = sys.stdin
    sys.stdin = StringIO(data)
    try:
        return input("请输入: ")
    finally:
        # Always restore the real stdin, including when input() raises.
        sys.stdin = original_stdin


result = simulate_input("模拟的用户输入\n")
print("模拟输入结果:", result)

6. 流式解码 — 处理不完整的字节序列
---------------------------------------

from io import TextIOWrapper, BytesIO

# 场景: 网络传输中收到的分段数据可能跨字符边界
def stream_decode_example():
    """Decode UTF-8 text that arrives in chunks split inside a character.

    An incremental decoder keeps the trailing bytes of an unfinished
    character between calls, so each chunk can be decoded as it arrives.
    Prints the reassembled text and returns nothing.
    """
    import codecs  # local import: not imported at file level

    # UTF-8 encoding of "你好": three bytes per character.
    data = b"\xe4\xbd\xa0\xe5\xa5\xbd"
    # Split on purpose in the middle of the second character.
    chunk1 = data[:4]  # \xe4\xbd\xa0\xe5 -> "你" plus one dangling lead byte
    chunk2 = data[4:]  # \xa5\xbd -> the rest of "好"

    decoder = codecs.getincrementaldecoder("utf-8")()

    # First chunk: the complete character is returned, the dangling \xe5 is
    # held inside the decoder instead of raising UnicodeDecodeError.
    pieces = [decoder.decode(chunk1)]

    # Second chunk completes the pending character. final=True makes the
    # decoder raise if the input still ends with an incomplete sequence.
    pieces.append(decoder.decode(chunk2, final=True))

    result = "".join(pieces)
    print("流式解码结果:", result)  # 你好


stream_decode_example()

7. 创建自定义流对象
-----------------------

from io import RawIOBase, BufferedIOBase
import os

class SimpleRawIO(RawIOBase):
    """Read-only raw stream served from an in-memory ``bytes`` object."""

    def __init__(self, data: bytes):
        """Store ``data`` and start reading at offset 0."""
        super().__init__()
        self._data = data
        self._pos = 0  # offset of the next unread byte

    def readable(self):
        """Report that the stream supports reading."""
        return True

    def readinto(self, b):
        """Copy unread bytes into the writable buffer ``b``.

        Returns:
            The number of bytes copied; 0 once the data is exhausted.
        """
        n = min(len(b), len(self._data) - self._pos)
        b[:n] = self._data[self._pos:self._pos + n]
        self._pos += n
        return n

    def read(self, size=-1):
        """Return up to ``size`` bytes directly, without an intermediate buffer.

        Args:
            size: Maximum number of bytes to return. ``None`` or any
                negative value means "everything that is left".

        Returns:
            The bytes read; ``b""`` once the data is exhausted.
        """
        if size is None or size < 0:
            end = len(self._data)
        else:
            end = self._pos + size
        result = self._data[self._pos:end]
        self._pos += len(result)
        return result


# Use the custom stream.
custom_stream = SimpleRawIO(b"Hello Custom IO!")
print("自定义流读取:", custom_stream.read(5))  # b'Hello'
print("继续读取:", custom_stream.read())  # b' Custom IO!'

# 实际应用: 文件系统抽象, 加密流包装
class Rot13Wrapper(RawIOBase):
    """Raw stream wrapper that applies ROT13 to every byte read through it.

    ASCII letters are rotated by 13 places; all other bytes pass through
    unchanged. ROT13 is its own inverse, so the same wrapper both encodes
    and decodes.
    """

    # Byte-level translation table; bytes not listed map to themselves.
    _ROT13_TABLE = bytes.maketrans(
        b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ",
        b"nopqrstuvwxyzabcdefghijklmNOPQRSTUVWXYZABCDEFGHIJKLM",
    )

    def __init__(self, underlying):
        """Wrap ``underlying``, an object that provides ``readinto()``."""
        super().__init__()
        self._stream = underlying

    def readinto(self, b):
        """Fill ``b`` from the wrapped stream, then transform it in place.

        Returns:
            Whatever the wrapped stream's ``readinto()`` returned.
        """
        n = self._stream.readinto(b)
        if n:  # skip when nothing was read (0 or None)
            # One translate() call replaces the per-byte Python loop.
            b[:n] = bytes(b[:n]).translate(self._ROT13_TABLE)
        return n

    def readable(self):
        """Report that the stream supports reading."""
        return True


# ROT13 demo
raw = BytesIO(b"Hello World!")
rot13 = Rot13Wrapper(raw)
result = bytearray(20)
n = rot13.readinto(result)
print("ROT13 编码:", result[:n])  # bytearray(b'Uryyb Jbeyq!')

总结: io 模块支持在内存中模拟文件操作, 对测试、数据转换和协议处理非常有用。