当前位置: 首页 > news >正文

编写 MapReduce 程序清洗信件内容数据

  1. 项目结构
    text
    LetterCleaning/
    ├── src/main/java/com/letter/cleaning/
    │ ├── LetterCleaner.java # 主清洗程序
    │ ├── LetterMapper.java # Mapper类
    │ ├── LetterReducer.java # Reducer类
    │ └── LetterWritable.java # 自定义数据类型
    ├── input/ # 输入数据目录
    └── output/ # 输出数据目录
  2. 完整代码实现
    java
    package com.letter.cleaning;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class LetterCleaner {

/*** Mapper类 - 负责数据清洗和过滤*/
public static class LetterMapper extends Mapper<Object, Text, Text, Text> {private Text outKey = new Text();private Text outValue = new Text();// 正则表达式模式private Pattern datePattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");private Pattern phonePattern = Pattern.compile("1[3-9]\\d{9}");private Pattern idCardPattern = Pattern.compile("\\d{17}[0-9Xx]");private Pattern emailPattern = Pattern.compile("\\w+@\\w+\\.\\w+");private Pattern specialCharPattern = Pattern.compile("[<>\"'&]");@Overridepublic void map(Object key, Text value, Context context) throws IOException, InterruptedException {String line = value.toString();// 跳过空行if (line.trim().isEmpty()) {return;}try {// 清洗单条数据CleanedLetter letter = cleanLetter(line);// 验证数据完整性if (validateLetter(letter)) {outKey.set(letter.getDate());outValue.set(letter.toString());context.write(outKey, outValue);}} catch (Exception e) {// 记录错误数据,但不中断程序System.err.println("Error processing line: " + line);context.getCounter("LetterCleaner", "Error Lines").increment(1);}}/*** 清洗单条信件数据*/private CleanedLetter cleanLetter(String rawData) {CleanedLetter letter = new CleanedLetter();// 假设输入格式:标题||日期||内容||来源String[] parts = rawData.split("\\|\\|");if (parts.length >= 3) {// 清洗标题letter.setTitle(cleanText(parts[0]));// 清洗日期letter.setDate(extractDate(parts[1]));// 清洗内容String content = cleanText(parts[2]);content = removePrivacyInfo(content);content = removeHtmlTags(content);content = normalizeWhitespace(content);letter.setContent(content);// 清洗来源(如果有)if (parts.length >= 4) {letter.setSource(cleanText(parts[3]));}// 提取元数据letter.setWordCount(countWords(content));letter.setProcessTime(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date()));}return letter;}/*** 清洗文本:去除特殊字符、多余空格*/private String cleanText(String text) {if (text == null) return "";// 去除首尾空格text = text.trim();// 替换HTML实体text = text.replace("&nbsp;", " ").replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&").replace("&quot;", "\"");// 去除控制字符text = 
text.replaceAll("[\\x00-\\x1F\\x7F]", "");return text;}/*** 提取标准日期格式*/private String extractDate(String dateStr) {Matcher matcher = datePattern.matcher(dateStr);if (matcher.find()) {return matcher.group();}// 尝试其他常见格式try {SimpleDateFormat[] formats = {new SimpleDateFormat("yyyy/MM/dd"),new SimpleDateFormat("yyyy年MM月dd日"),new SimpleDateFormat("yyyy.MM.dd")};for (SimpleDateFormat format : formats) {try {Date date = format.parse(dateStr);return new SimpleDateFormat("yyyy-MM-dd").format(date);} catch (Exception e) {continue;}}} catch (Exception e) {// 忽略解析错误}return "0000-00-00";}/*** 去除隐私信息(手机号、身份证等)*/private String removePrivacyInfo(String text) {// 替换手机号Matcher phoneMatcher = phonePattern.matcher(text);text = phoneMatcher.replaceAll("[手机号已隐藏]");// 替换身份证号Matcher idMatcher = idCardPattern.matcher(text);text = idMatcher.replaceAll("[身份证已隐藏]");// 替换邮箱Matcher emailMatcher = emailPattern.matcher(text);text = emailMatcher.replaceAll("[邮箱已隐藏]");return text;}/*** 去除HTML标签*/private String removeHtmlTags(String text) {// 去除HTML标签text = text.replaceAll("<[^>]*>", "");// 去除CSS样式text = text.replaceAll("<style[^>]*>[^<]*</style>", "");// 去除JavaScripttext = text.replaceAll("<script[^>]*>[^<]*</script>", "");return text;}/*** 规范化空白字符*/private String normalizeWhitespace(String text) {// 将多个空格合并为一个text = text.replaceAll("\\s+", " ");// 确保中文标点前后无空格text = text.replaceAll("\\s+([,。!?;:])", "$1");text = text.replaceAll("([,。!?;:])\\s+", "$1");return text;}/*** 统计词数*/private int countWords(String text) {if (text == null || text.isEmpty()) return 0;return text.split("\\s+").length;}/*** 验证数据完整性*/private boolean validateLetter(CleanedLetter letter) {// 标题不能为空if (letter.getTitle() == null || letter.getTitle().isEmpty()) {return false;}// 日期必须有效if (letter.getDate() == null || letter.getDate().equals("0000-00-00")) {return false;}// 内容不能为空且有一定长度if (letter.getContent() == null || letter.getContent().length() < 10) {return false;}return true;}
}/*** Reducer类 - 可选,用于进一步聚合处理*/
public static class LetterReducer extends Reducer<Text, Text, Text, NullWritable> {@Overridepublic void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException {// 按日期分组,可以在这里进行去重或其他聚合操作for (Text value : values) {context.write(value, NullWritable.get());}}
}/*** 主方法 - 配置和运行Job*/
public static void main(String[] args) throws Exception {Configuration conf = new Configuration();String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();if (otherArgs.length < 2) {System.err.println("Usage: LetterCleaner <input path> <output path>");System.exit(2);}Job job = Job.getInstance(conf, "Letter Data Cleaner");job.setJarByClass(LetterCleaner.class);job.setMapperClass(LetterMapper.class);job.setReducerClass(LetterReducer.class);// 设置输出类型job.setOutputKeyClass(Text.class);job.setOutputValueClass(Text.class);// Mapper输出类型(如果与Reducer不同)job.setMapOutputKeyClass(Text.class);job.setMapOutputValueClass(Text.class);// 设置输入输出路径FileInputFormat.addInputPath(job, new Path(otherArgs[0]));FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));// 设置Reducer数量job.setNumReduceTasks(3);System.exit(job.waitForCompletion(true) ? 0 : 1);
}

}

/**
 * Value object holding one cleaned letter record.
 *
 * <p>All string fields default to the empty string and never hold {@code null}:
 * setters replace {@code null} with {@code ""} so that {@link #toString()} never
 * writes the literal text "null" into the tab-separated output.
 */
class CleanedLetter {
    private String title;
    private String date;
    private String content;
    private String source;
    private int wordCount;
    private String processTime;

    /** Creates a letter with empty string fields and a word count of 0. */
    public CleanedLetter() {
        this.title = "";
        this.date = "";
        this.content = "";
        this.source = "";
        this.wordCount = 0;
        this.processTime = "";
    }

    /** Maps {@code null} to the empty string so fields keep their never-null invariant. */
    private static String nullToEmpty(String value) {
        return value == null ? "" : value;
    }

    /** @return the letter title, never {@code null} */
    public String getTitle() { return title; }

    /** @param title the letter title; {@code null} is stored as {@code ""} */
    public void setTitle(String title) { this.title = nullToEmpty(title); }

    /** @return the letter date, never {@code null} */
    public String getDate() { return date; }

    /** @param date the letter date; {@code null} is stored as {@code ""} */
    public void setDate(String date) { this.date = nullToEmpty(date); }

    /** @return the letter body, never {@code null} */
    public String getContent() { return content; }

    /** @param content the letter body; {@code null} is stored as {@code ""} */
    public void setContent(String content) { this.content = nullToEmpty(content); }

    /** @return the letter source, never {@code null} */
    public String getSource() { return source; }

    /** @param source the letter source; {@code null} is stored as {@code ""} */
    public void setSource(String source) { this.source = nullToEmpty(source); }

    /** @return the stored word count */
    public int getWordCount() { return wordCount; }

    /** @param wordCount the word count to store */
    public void setWordCount(int wordCount) { this.wordCount = wordCount; }

    /** @return the processing timestamp text, never {@code null} */
    public String getProcessTime() { return processTime; }

    /** @param processTime the processing timestamp text; {@code null} is stored as {@code ""} */
    public void setProcessTime(String processTime) { this.processTime = nullToEmpty(processTime); }

    /**
     * Serialises the record as six tab-separated columns in the order
     * title, date, content, source, word count, process time.
     *
     * @return the tab-separated representation
     */
    @Override
    public String toString() {
        return String.join("\t",
                title, date, content, source,
                String.valueOf(wordCount), processTime);
    }
}

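  3. POM.xml 依赖配置
    xml
    <project xmlns="http://maven.apache.org/POM/4.0.0">
      <modelVersion>4.0.0</modelVersion>
      <groupId>com.letter</groupId>
      <artifactId>letter-cleaning</artifactId>
      <version>1.0</version>

      <properties>
        <hadoop.version>3.3.4</hadoop.version>
      </properties>

      <dependencies>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-common</artifactId>
          <version>${hadoop.version}</version>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-mapreduce-client-core</artifactId>
          <version>${hadoop.version}</version>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
          <version>${hadoop.version}</version>
        </dependency>
        <!-- 日志 -->
        <dependency>
          <groupId>org.slf4j</groupId>
          <artifactId>slf4j-log4j12</artifactId>
          <version>1.7.30</version>
        </dependency>
      </dependencies>

      <build>
        <plugins>
          <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.8.1</version>
            <configuration>
              <source>1.8</source>
              <target>1.8</target>
            </configuration>
          </plugin>
          <!-- 打包插件 -->
          <plugin>
            <artifactId>maven-assembly-plugin</artifactId>
            <configuration>
              <descriptorRefs>
                <descriptorRef>jar-with-dependencies</descriptorRef>
              </descriptorRefs>
            </configuration>
            <executions>
              <execution>
                <id>make-assembly</id>
                <phase>package</phase>
                <goals>
                  <goal>single</goal>
                </goals>
              </execution>
            </executions>
          </plugin>
        </plugins>
      </build>
    </project>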
4. 运行脚本
bash
#!/bin/bash

# 编译打包
mvn clean package

# 创建输入目录
hdfs dfs -mkdir -p /user/letter/input

# 上传数据到HDFS
hdfs dfs -put ./input/* /user/letter/input/

# 运行MapReduce作业
hadoop jar target/letter-cleaning-1.0-jar-with-dependencies.jar \
  com.letter.cleaning.LetterCleaner \
  /user/letter/input \
  /user/letter/output

# 查看结果(作业设置了3个Reducer,结果分布在 part-r-00000 ~ part-r-00002)
hdfs dfs -cat /user/letter/output/part-r-* | head -20

# 下载结果到本地
hdfs dfs -get /user/letter/output ./cleaned_data
5. 数据清洗效果示例
清洗前:

text
关于小区停车难问题||2024/01/15||

尊敬的领导:

您好!我们小区停车非常困难,我的电话是13800138000。希望能解决。||网站
清洗后:

text
关于小区停车难问题 2024-01-15 尊敬的领导:您好!我们小区停车非常困难,我的电话是[手机号已隐藏]。希望能解决。 网站 18 2024-03-20 14:30:25

http://www.jsqmd.com/news/385111/

相关文章:

  • Spring Boot Test深度解析
  • 怎么搭建OpenClaw?2026年天翼云搭建简易指南
  • 避坑!AI提示系统用户导向设计的10个常见误区与正确做法
  • Nock深度解析
  • 怎么安装OpenClaw?2026年移动云一键部署入门
  • Mirage JS深度解析
  • 数据预处理助力大数据领域的智能决策支持
  • AI原生应用领域知识库构建:提升应用智能的核心策略
  • 2026年南宁管道疏通推荐:多场景管道问题解决评价,直击堵塞与溢流痛点 - 十大品牌推荐
  • 什么是OpenClaw?2026年华为云一键部署入门
  • 提示工程回归测试的成本控制:这6个策略帮你节省30%预算
  • Testcontainers深度解析
  • 如何选择南京管道疏通服务?2026年推荐与评测解决效率低下痛点 - 十大品牌推荐
  • 怎么部署OpenClaw?2026年京东云搭建基础教学
  • 【数据结构】冒泡排序
  • 2026年南宁管道疏通推荐:多场景管道问题解决痛点全面评价与指南 - 十大品牌推荐
  • 2026年评价高的荔枝面花岗岩地铺石,光面地铺石厂家推荐榜单 - 品牌鉴赏师
  • 【数据结构】希尔排序
  • 长期记忆在内容生成AI中的关键作用与技术实现
  • Agent 设计模式全攻略(非常详细),20个大厂方案从入门到精通,收藏这一篇就够了!
  • 2026年宁波管道疏通推荐:多场景疏通服务评价,解决堵塞与溢流核心痛点 - 品牌推荐
  • 实用指南:【专辑】AI大模型应用开发入门-拥抱Hugging Face与Transformers生态 - 基于BERT文本分类模型微调
  • 工业设备智能诊断系统实战全攻略(非常详细),LangGraph+MCP+Chainlit从入门到精通,收藏这一篇就够了!
  • 2026年宁波管道疏通推荐:基于多场景实测评价,解决堵塞与返水核心痛点 - 品牌推荐
  • 管道堵塞如何高效处理?2026年南通管道疏通服务推荐与全面评价 - 品牌推荐
  • 如何选择南通管道疏通服务?2026年全面评测与推荐,解决效率与安全痛点 - 品牌推荐
  • 学习进度 25
  • 2026年南京管道疏通推荐:市政与家庭场景全面评测,解决紧急疏通与长期维护痛点 - 十大品牌推荐
  • 《计算机是怎样跑起来的》————通向计算机世界的三把钥匙
  • C++与C#中的参数传递方式:值传递、地址传递、引用传递