当前位置: 首页 > news >正文

语音指令分类模型训练(基于CNN方法)

遇到的问题: 先提取MFCC特征,再使用CNN网络训练,转换后的tflite模型在移动端识别率很低,究其原因是训练样本和测试样本在提取MFCC时没有完全对齐。因此本文采取自定义keras的MFCC层进行特征提取,即网络模型的输入为长度统一的音频数据,在网络内部通过自定义的MFCC层来提取MFCC特征。如此,使用训练好的模型测试时,不需要考虑MFCC的特征提取与对齐,直接喂入跟训练时长度一致的音频数据即可。

1、加载数据,并划分训练集、验证集--> read_data.py

import os

import librosa
import numpy as np
from sklearn.model_selection import train_test_split

FIXED_SAMPLE_RATE = 16000  # unified sample rate, 16 kHz (standard for speech)
MAX_LEN = 36000            # fixed number of samples per clip
N_MFCC = 13                # number of MFCC features (13 is the usual choice)


# Load the dataset (raw audio only, no feature extraction here — features are
# computed inside the network, which keeps training and on-device inference
# aligned for the tflite conversion).
def load_data4cnn(data_pt):
    """Load fixed-length audio clips and split them into train/test sets.

    data_pt: dataset root; each subfolder is named by its integer class label
             and contains the audio files for that class.
    Returns (X_train, X_test, y_train, y_test) where X_* has shape
    (samples, MAX_LEN) and y_* has shape (samples,).
    """
    X, y = [], []
    labels = os.listdir(data_pt)
    # Fix: use a context manager so the label-map file is always closed
    # (the original left the handle open).
    with open("result/label.txt", 'w+') as f:
        for label in labels:
            folder = os.path.join(data_pt, label)
            # write "id name" mapping; folder name doubles as both
            f.write(label + " " + label + "\n")
            for fname in os.listdir(folder):
                fpath = os.path.join(folder, fname)
                y_audio, sr = librosa.load(fpath, sr=FIXED_SAMPLE_RATE)
                # unify length: truncate long clips, zero-pad short ones
                if len(y_audio) > MAX_LEN:
                    y_audio = y_audio[:MAX_LEN]
                else:
                    y_audio = np.pad(y_audio, (0, MAX_LEN - len(y_audio)))
                X.append(y_audio)
                y.append(int(label))
    X = np.array(X)
    y = np.array(y)
    # Fix: stratify keeps the class balance in both splits and a fixed seed
    # makes the split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=42)
    print("训练样本及标签:", X_train.shape, y_train.shape)
    print("测试样本及标签:", X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test


if __name__ == "__main__":
    load_data4cnn(data_pt='data4c')

2、自定义MFCC层 -->MFCC.py

import tensorflow as tf
from keras.saving import register_keras_serializable
# from tensorflow.keras.utils import register_keras_serializable


@register_keras_serializable()
class MFCCLayer(tf.keras.layers.Layer):
    """Keras layer that extracts MFCC features inside the model graph.

    Putting feature extraction in the graph guarantees training and
    on-device (tflite) inference use identical MFCC computation, which is
    the alignment problem this article sets out to solve.

    Input:  float audio batch of shape (batch, samples) at 16 kHz.
    Output: (batch, frames, n_mfcc) MFCC tensor.
    """

    def __init__(self, n_mfcc=13, **kwargs):
        # Generalized: the original hard-coded 13 coefficients; n_mfcc=13
        # keeps backward compatibility while allowing other sizes.
        super().__init__(**kwargs)
        self.n_mfcc = n_mfcc

    def call(self, audio):
        # STFT with 2048-sample frames and hop 512 -> 1025 frequency bins
        stft = tf.signal.stft(
            audio,
            frame_length=2048,
            frame_step=512,
            fft_length=2048,
            window_fn=tf.signal.hann_window,
        )
        spectrogram = tf.abs(stft) ** 2  # power spectrogram
        # 128 mel bins over 0–8000 Hz (Nyquist for 16 kHz audio)
        mel_matrix = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=128,
            num_spectrogram_bins=1025,  # fft_length // 2 + 1
            sample_rate=16000,
            lower_edge_hertz=0,
            upper_edge_hertz=8000,
        )
        mel = tf.matmul(spectrogram, mel_matrix)
        log_mel = tf.math.log(mel + 1e-6)  # epsilon avoids log(0)
        mfcc = tf.signal.mfccs_from_log_mel_spectrograms(log_mel)
        # keep only the first n_mfcc cepstral coefficients
        return mfcc[:, :, :self.n_mfcc]

    def get_config(self):
        # Fix: expose n_mfcc in the config so a saved model restores the
        # layer with the same coefficient count.
        config = super().get_config()
        config.update({"n_mfcc": self.n_mfcc})
        return config

3、训练CNN模型

如果使用GPU训练,添加init_gpu.py,否则忽略下面的代码

import os

import tensorflow as tf
from keras import backend as K


def init_gpu():
    """Configure TensorFlow logging and GPU memory growth before training.

    Safe to call when no GPU is present: it simply prints an empty device
    list and clears the Keras session.
    """
    # Fix: the original set TF_CPP_MIN_LOG_LEVEL to '0' (show everything)
    # and then overwrote it with '2'; set it once to '2' so INFO/WARNING
    # logs are hidden, matching the intent of "hide verbose logs".
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # 0=all, 1=INFO, 2=WARNING, 3=ERROR
    # grow GPU memory on demand instead of grabbing it all up front
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
    gpus = tf.config.list_physical_devices('GPU')
    print(gpus)
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print(f"✓ GPU 配置成功,使用 {len(gpus)} 个 GPU")
        except RuntimeError as e:
            # memory growth must be set before GPUs are initialized
            print(f"GPU 配置失败: {e}")
    # clear any stale session before training to release GPU memory
    K.clear_session()

CNN模型训练

import numpy as np
from keras.models import Model
from keras.layers import Input, Conv1D, MaxPooling1D, GlobalAvgPool1D, Dense, Dropout, BatchNormalization
from matplotlib import pyplot as plt

from read_data import load_data4cnn, MAX_LEN
from init_gpu import init_gpu
from MFCC import MFCCLayer


def cnn_model(num_classes):
    """Build the 1-D CNN classifier.

    The raw fixed-length waveform goes in; the first layer converts it to
    MFCC features inside the graph, so inference needs no external feature
    extraction.
    """
    inputs = Input(shape=(MAX_LEN,), name="audio_input")
    x = MFCCLayer()(inputs)  # (batch, frames, 13) MFCC features
    x = Conv1D(512, 3, padding='same', activation='relu')(x)
    x = MaxPooling1D(2)(x)
    x = Dropout(0.3)(x)
    x = Conv1D(64, 3, padding='same', activation='relu')(x)
    x = GlobalAvgPool1D()(x)
    x = Dropout(0.35)(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.35)(x)
    outputs = Dense(num_classes, activation='softmax')(x)
    return Model(inputs, outputs)


def train_model(epochs=50):
    """Train the CNN, report test accuracy, save the model, plot curves."""
    X_train, X_test, y_train, y_test = load_data4cnn("data4c")
    model = cnn_model(num_classes=len(np.unique(y_train)))
    print(model.summary())
    # sparse loss: labels are integer class ids, not one-hot vectors
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])
    history = model.fit(X_train, y_train,
                        validation_data=(X_test, y_test),
                        epochs=epochs, batch_size=32)
    loss, acc = model.evaluate(X_test, y_test)
    print(f"\nCNN 模型测试准确率:{acc * 100:.2f}%")
    # save in native Keras format (custom MFCCLayer is serializable)
    model.save("result/cnn_model1.keras")
    print("\n训练完成,模型文件保存至--> result/cnn_model1.keras")
    result_curve(history)


def result_curve(result):
    """Plot accuracy and loss curves side by side and save them as SVG."""
    # Fix: the original had `plt.figure` without parentheses, which only
    # references the function and never creates a figure.
    plt.figure()
    plt.subplot(121)
    plt.plot(result.epoch, result.history['accuracy'], label="accuracy")
    plt.plot(result.epoch, result.history['val_accuracy'], label="val_accuracy")
    plt.scatter(result.epoch, result.history['accuracy'])
    plt.scatter(result.epoch, result.history['val_accuracy'])
    plt.legend(loc='lower right')
    plt.title("CNN")
    plt.subplot(122)
    plt.plot(result.epoch, result.history['loss'], label="loss")
    plt.plot(result.epoch, result.history['val_loss'], label="val_loss")
    plt.scatter(result.epoch, result.history['loss'], marker='*')
    plt.scatter(result.epoch, result.history['val_loss'], marker='*')
    plt.legend(loc='upper right')
    plt.title("CNN")
    plt.savefig('result/CNN1_curve.svg')


if __name__ == "__main__":
    init_gpu()
    train_model(200)

4、转为tflite模型,后续供Android手机使用-->keras2tflite.py

import os

import tensorflow as tf
from keras.saving import load_model

from MFCC import MFCCLayer  # importing registers the custom layer for deserialization


def keras_2_tflite(model_pt, out_pt):
    """Convert a saved .keras model to a .tflite file for Android use.

    model_pt: path to the trained .keras model.
    out_pt:   destination path for the converted .tflite model.
    """
    # Fix: the original imported load_model twice (keras.models then
    # keras.saving, the second shadowing the first); keep only keras.saving.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # silence TF logging
    # safe_mode=False is required to deserialize the custom MFCC layer
    model = load_model(model_pt, safe_mode=False)

    # Wrap inference in a tf.function so a concrete function can be traced
    # (works around: TypeError: 'NoneType' object is not callable).
    @tf.function(jit_compile=False)
    def inference_func(input_data):
        return model(input_data, training=False)

    input_shape = model.input_shape
    concrete_func = inference_func.get_concrete_function(
        tf.TensorSpec(input_shape, tf.float32))
    # convert to TFLite from the traced concrete function
    converter = tf.lite.TFLiteConverter.from_concrete_functions(
        [concrete_func], model)
    # Required: fall back to select TF ops for kernels TFLite lacks
    # (e.g. the in-graph signal-processing ops; also needed for LSTM).
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS,
        tf.lite.OpsSet.SELECT_TF_OPS,
    ]
    converter.experimental_allow_custom_ops = True
    # converter.optimizations = [tf.lite.Optimize.DEFAULT]  # optional quantization
    tflite_model = converter.convert()
    with open(out_pt, "wb") as f:
        f.write(tflite_model)
    print("=" * 50)
    print("转换成功!文件已保存到-->", out_pt)


if __name__ == "__main__":
    # NOTE(review): the training script saves "result/cnn_model1.keras";
    # confirm this path points at the model you intend to convert.
    keras_2_tflite(model_pt="result/cnn_model.keras", out_pt="result/cnn_model.tflite")

5、测试CNN模型

import librosa
import numpy as np
from keras.models import load_model

from read_data import FIXED_SAMPLE_RATE, MAX_LEN, N_MFCC
from MFCC import MFCCLayer  # importing registers the custom layer for load_model

# label id -> spoken command text
dct = {0:"关灯", 1:"开灯", 2:"关闭风扇", 3:"打开风扇"}

# module-level cache so the model is loaded from disk only once
_model = None


def _get_model():
    """Lazily load and cache the trained Keras model."""
    global _model
    if _model is None:
        # NOTE(review): the training script saves "result/cnn_model1.keras";
        # confirm this path points at the intended model file.
        _model = load_model("result/cnn_lstm_model.keras")
    return _model


def process_data(fpt):
    """Load one audio file and shape it for the model.

    Returns a (1, MAX_LEN) float array: resampled to FIXED_SAMPLE_RATE and
    truncated/zero-padded exactly as during training.
    """
    y_audio, sr = librosa.load(fpt, sr=FIXED_SAMPLE_RATE)
    # unify length to match the training pipeline
    if len(y_audio) > MAX_LEN:
        y_audio = y_audio[:MAX_LEN]
    else:
        y_audio = np.pad(y_audio, (0, MAX_LEN - len(y_audio)))
    X = np.array([y_audio])
    return X


def predict(file_path):
    """Classify one audio file.

    Returns (label_id_str, confidence_str), or (None, None) when the top
    confidence is below the 0.3 rejection threshold.
    """
    # Fix: the original reloaded the model on every call; reuse the cache.
    model = _get_model()
    X = process_data(file_path)
    pred = model.predict(X)[0]
    print(pred)
    y = np.argmax(pred)
    cnf = pred[y]
    if cnf < 0.3:  # rejection threshold for low-confidence predictions
        print("未识别")
        return None, None
    else:
        rst = dct[y]  # human-readable command text (printed id is returned)
        print(f"识别结果:{y}\t置信度:{cnf}")
        return str(y), str(round(cnf, 2))


if __name__ == "__main__":
    # sample clip for a quick smoke test
    file = "test_data/3_1774506934932.wav"
    predict(file)

6、api封装及测试

api.py

import os
import tempfile

from flask import Flask, request, jsonify

from test_cnn_model import predict

app = Flask(__name__)


@app.route('/predict_api', methods=['POST'])
def predict_api():
    """Accept an uploaded audio file and return {label, confidence} as JSON.

    Responds 400 when no file field is present; prediction errors are
    returned as {"error": ...}.
    """
    if 'file' not in request.files:
        return jsonify({"error": "没有文件"}), 400
    file = request.files['file']
    # Fix: a fixed "temp.wav" name collides when requests arrive
    # concurrently; use a unique temporary file per request instead.
    fd, temp_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # predict() reopens the file by path
    file.save(temp_path)
    try:
        rst, cnf = predict(temp_path)
        return jsonify({
            "label": rst,
            "confidence": cnf
        })
    except Exception as e:
        # surface the error to the client instead of a bare 500
        return jsonify({"error": str(e)})
    finally:
        # always clean up the temporary upload
        if os.path.exists(temp_path):
            os.remove(temp_path)


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=True)

api_test.py

import requests

# url = "http://192.168.1.3:5000/predict"
url = "http://localhost:5000/predict_api"
file_path = "test_data/0_1774506907089.wav"

# Fix: the original opened the file without ever closing it; a context
# manager guarantees the handle is released after the upload.
with open(file_path, "rb") as fh:
    files = {"file": fh}
    res = requests.post(url, files=files)
print(res.json())
http://www.jsqmd.com/news/800082/

相关文章:

  • 深入学习 Helm:K8s 的包管理器,管理复杂应用的终极指南
  • Cadence Allegro 17.4保姆级教程:PCB丝印位号重排与反标回原理图完整避坑指南
  • DeepSeek表格制作
  • Tera持久化缓存机制:如何实现毫秒级数据访问
  • 终极穿越机飞控解决方案:Betaflight如何重塑你的飞行体验
  • Kimi融资超376亿商业化成熟,DeepSeek拟募资500亿估值超515亿美元,谁能笑到最后?
  • 2026注塑厂家推荐:电子零配件加工厂+机加工镭雕厂家+钣金加工厂推荐 - 栗子测评
  • 手把手复刻1889年Kallitype专利工艺:用Midjourney生成符合John Spence历史级密度曲线的负片(含Log-C转Kallitype Density Table)
  • 构建智能代码筛选框架:从AST解析到规则引擎的工程实践
  • Windows实时语音转文字终极指南:TMSpeech让离线字幕生成如此简单
  • Python与WebAssembly:在浏览器中运行高性能Python代码实战指南
  • 如何高效进行后端开发中的数据库设计与优化
  • 51单片机项目实战:用LCD12864自制一个温湿度计(带中文界面和自定义图标)
  • Graphpack与Express集成:如何添加自定义中间件和路由
  • ScrollNice:开源鼠标滚轮替代方案,悬停滚动与高度自定义体验
  • 鼎捷数智冲刺港股:第一季营收4.4亿,扣非后净亏2112万 富士康是大股东
  • 保姆级教程:用C++在洛谷B2027、OpenJudge上正确计算球的体积(附PI定义与格式化输出详解)
  • 别再只会用df -h了!用ncdu可视化揪出Linux服务器磁盘爆满的元凶(附Docker日志清理脚本)
  • 终极Obsidian笔记模板指南:20+专业模板快速构建个人知识库
  • Tera数据库:从入门到精通,打造互联网级分布式存储系统
  • FPGA合成工具优化策略与硬件设计实践
  • 【嵌入式Linux应用开发基础】进程间通信:套接字
  • BNO055与JY901传感器选型实战:从硬件连接到精度实测
  • AI编程脚手架:用Claude代码模板提升开发效率与规范
  • 贾跃亭出任FF全球CEO,Jerry任董事长,升级为物理AI生态系统公司
  • 第二章-08-创建目录命令(mkdir)
  • 别再只存model.state_dict()了!深入理解PyTorch的state_dict,优化你的模型保存策略
  • OSINT自动化框架openeir:模块化设计与情报收集流水线构建
  • 杭州品深电源科技有限公司2026通信电源厂家精选:电源定制厂家/电源模块厂家优选杭州品深电源科技 - 栗子测评
  • 【带余除法】信息学奥赛一本通C语言解法(题号1009)