当前位置: 首页 > news >正文

实验三

任务一:导入数据

  • 创建两个数据表
    • Hive QL代码
      USE default;  -- 选择使用名为 default 的数据库DROP TABLE IF EXISTS ccf_offline_stage1_train;
      CREATE TABLE ccf_offline_stage1_train (user_id STRING,merchant_id STRING,coupon_id STRING,discount_rate STRING,distance STRING,date_received STRING,`date` STRING
      ) -- 创建新表,包含7个字段
      -- 这里 data 要打 `` 的原因是data是关键字
      ROW FORMAT DELIMITED
      FIELDS TERMINATED BY ','  -- 指定数据文件使用逗号(,)作为字段分隔符(CSV格式)
      STORED AS TEXTFILE  -- 数据以文本文件格式存储
      TBLPROPERTIES ("skip.header.line.count"="1"
      ); -- 表属性设置:跳过外部数据文件的第一行(通常是标题行)DROP TABLE IF EXISTS ccf_online_stage1_train;
      CREATE TABLE ccf_online_stage1_train (user_id STRING,merchant_id STRING,action STRING,coupon_id STRING,discount_rate STRING,date_received STRING,`date` STRING
      )
      ROW FORMAT DELIMITED
      FIELDS TERMINATED BY ','
      STORED AS TEXTFILE
      TBLPROPERTIES ("skip.header.line.count"="1"
      );LOAD DATA LOCAL INPATH '/workspace/hive_coupon/ccf_offline_stage1_train.csv'
      -- 从本地(LOCAL)文件系统(不是HDFS)加载文件
      OVERWRITE INTO TABLE ccf_offline_stage1_train;
      -- 覆盖写入新数据到之前的表中LOAD DATA LOCAL INPATH '/workspace/hive_coupon/ccf_online_stage1_train.csv'
      OVERWRITE INTO TABLE ccf_online_stage1_train;
      
    • 执行结果
      image
  • 查询验证数据加载成功
    • Hive QL代码
      USE default;SELECT * FROM ccf_offline_stage1_train LIMIT 5;
      -- 从 ccf_offline_stage1_train 表中查询所有字段(* 表示所有列),只返回前五列
      SELECT * FROM ccf_online_stage1_train LIMIT 5;
      SELECT COUNT(*) FROM ccf_offline_stage1_train;
      -- 统计 ccf_offline_stage1_train 表中的总行数
      SELECT COUNT(*) FROM ccf_online_stage1_train;
    • 执行结果
      • 查询
        image
      • 计数
        image
        image

任务二:基本数据查询

  • 查询⽤户⾏为数量
    • Hive QL代码
      USE default;SELECTCASE actionWHEN '0' THEN 'CLICK'WHEN '1' THEN 'BUY'WHEN '2' THEN 'RECEIVE'ELSE CONCAT('UNKNOWN_', action)END AS behavior, -- 将原始的动作编码转换为有意义的英文名称COUNT(*) AS total_count -- 计算每种行为出现的总次数
      FROM ccf_online_stage1_train -- 从名为 ccf_online_stage1_train 的数据表中读取数据
      GROUP BY action -- 按原始的动作编码进行分组统计
      ORDER BY total_count DESC; -- 按行为数量从高到低排序
      
    • 举例
      • 假设原始数据是:0,0,1,0,2,1
      • 分组之后
        组1 (action='0'): 记录1, 记录2, 记录4
        组2 (action='1'): 记录3, 记录6
        组3 (action='2'): 记录5
        
      • 对每个组分别统计行数
        组1: COUNT(*) = 3
        组2: COUNT(*) = 2  
        组3: COUNT(*) = 1
        
    • 执行结果
      image
  • 查询指定商家优惠券使⽤情况
    • Hive QL代码
      USE default;DROP TABLE IF EXISTS onlineconsumption_table;
      CREATE TABLE onlineconsumption_table AS -- 这里的AS是创建表的语法关键字,表示"基于查询结果创建表"
      SELECTmerchant_id,SUM(CASE WHEN coupon_id IS NOT NULL AND `date` IS NULL THEN 1 ELSE 0 END) AS negative_samples,SUM(CASE WHEN coupon_id IS NULL AND `date` IS NOT NULL THEN 1 ELSE 0 END) AS normal_consumption,SUM(CASE WHEN coupon_id IS NOT NULL AND `date` IS NOT NULL THEN 1 ELSE 0 END) AS positive_samples
      FROM ccf_online_stage1_train
      GROUP BY merchant_id;SELECT * FROM onlineconsumption_table LIMIT 20;
      
    • 执行结果
      image

任务三:数据聚合分析

  • 商家周边活跃顾客数量统计
    • Hive QL代码
      USE default;DROP TABLE IF EXISTS merchant_distance_active_customers;
      CREATE TABLE merchant_distance_active_customers AS
      SELECTmerchant_id,NVL(distance, 'UNKNOWN') AS distance_bucket,COUNT(DISTINCT user_id) AS active_customer_count
      FROM ccf_offline_stage1_train
      GROUP BY merchant_id, NVL(distance, 'UNKNOWN');SELECTmerchant_id,distance_bucket,active_customer_count
      FROM merchant_distance_active_customers
      ORDER BY merchant_id, distance_bucket
      LIMIT 50;
      
    • 执行结果
      image
      image
      其中第一列是商家ID,第二列是距离x,第三列是数目
  • 商家正样本比例统计
    • Hive QL代码
      USE default;DROP TABLE IF EXISTS merchant_positive_ratio;
      CREATE TABLE merchant_positive_ratio AS
      SELECTmerchant_id,positive_samples,negative_samples,normal_consumption,(negative_samples + normal_consumption + positive_samples) AS total_samples,CASEWHEN (negative_samples + normal_consumption + positive_samples) > 0 THENpositive_samples / CAST((negative_samples + normal_consumption + positive_samples) AS DOUBLE)ELSE 0.0END AS positive_ratio
      FROM onlineconsumption_table;SELECTmerchant_id,positive_ratio,positive_samples,total_samples
      FROM merchant_positive_ratio
      ORDER BY positive_ratio DESC, positive_samples DESC
      LIMIT 10;
      
    • 执行结果
      image

任务四:复杂查询与分析

  • 优惠券使用时间统计
    • Hive QL代码
      USE default;DROP TABLE IF EXISTS coupon_usage_interval_stats;
      CREATE TABLE coupon_usage_interval_stats AS
      WITH offline_clean AS (SELECTCASE WHEN coupon_id IS NOT NULL AND coupon_id != 'null' THEN coupon_id ELSE NULL END AS coupon_id,CASE WHEN `date` IS NOT NULL AND `date` != 'null' THEN `date` ELSE NULL END AS date_consumed,CASE WHEN date_received IS NOT NULL AND date_received != 'null' THEN date_received ELSE NULL END AS date_receivedFROM ccf_offline_stage1_train
      ),
      coupon_usage AS (SELECTcoupon_id,DATEDIFF(TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(date_consumed, 'yyyyMMdd'))),TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(date_received, 'yyyyMMdd')))) AS usage_intervalFROM offline_cleanWHERE coupon_id IS NOT NULLAND date_consumed IS NOT NULLAND date_received IS NOT NULL
      ),
      usage_counts AS (SELECTcoupon_id,COUNT(*) AS usage_count,AVG(usage_interval) AS avg_usage_intervalFROM coupon_usageGROUP BY coupon_id
      ),
      total_usage AS (SELECT SUM(usage_count) AS total_usage_count FROM usage_counts
      )
      SELECTu.coupon_id,u.usage_count,ROUND(u.avg_usage_interval, 2) AS avg_usage_interval
      FROM usage_counts u
      CROSS JOIN total_usage t
      WHERE u.usage_count > t.total_usage_count * 0.01;SELECTcoupon_id,usage_count,avg_usage_interval
      FROM coupon_usage_interval_stats
      ORDER BY usage_count DESC, coupon_id
      LIMIT 100;
      
    • 执行结果
      image
  • 优惠券折扣率统计
    • Hive QL代码
      USE default;DROP TABLE IF EXISTS coupon_usage_rate_stats;
      CREATE TABLE coupon_usage_rate_stats AS
      WITH offline_clean AS (SELECTCASE WHEN coupon_id IS NOT NULL AND coupon_id != 'null' THEN coupon_id ELSE NULL END AS coupon_id,CASE WHEN `date` IS NOT NULL AND `date` != 'null' THEN `date` ELSE NULL END AS date_consumed,CASE WHEN discount_rate IS NOT NULL AND discount_rate != 'null' THEN discount_rate ELSE NULL END AS discount_rateFROM ccf_offline_stage1_train
      ),
      coupon_totals AS (SELECTcoupon_id,MAX(discount_rate) AS discount_rate,COUNT(*) AS total_count,SUM(CASE WHEN date_consumed IS NOT NULL THEN 1 ELSE 0 END) AS used_countFROM offline_cleanWHERE coupon_id IS NOT NULLGROUP BY coupon_id
      ),
      final_stats AS (SELECTcoupon_id,used_count,total_count,CASE WHEN total_count > 0 THEN used_count / CAST(total_count AS DOUBLE) ELSE 0.0 END AS usage_rate,CASEWHEN discount_rate LIKE '%:%' THEN 'threshold'ELSE 'direct'END AS discount_type,CASEWHEN discount_rate LIKE '%:%' THENCASEWHEN CAST(SPLIT(discount_rate, ':')[0] AS DOUBLE) > 0 THEN(CAST(SPLIT(discount_rate, ':')[1] AS DOUBLE) / CAST(SPLIT(discount_rate, ':')[0] AS DOUBLE)) * 100ELSE NULLENDELSE(1 - CAST(discount_rate AS DOUBLE)) * 100END AS discount_percentageFROM coupon_totals
      )
      SELECTcoupon_id,ROUND(usage_rate, 4) AS usage_rate,used_count,total_count,ROUND(discount_percentage, 2) AS discount_percentage_value,discount_type
      FROM final_stats;SELECTcoupon_id,usage_rate,used_count,CONCAT(CAST(discount_percentage_value AS STRING), '%') AS discount_percentage,discount_type
      FROM coupon_usage_rate_stats
      ORDER BY usage_rate DESC, used_count DESC, coupon_id
      LIMIT 10;
      
    • 执行结果
      image
http://www.jsqmd.com/news/64220/

相关文章:

  • 实验五
  • 2025年12月新能源汽车轮胎推荐:最新电车胎精选指南
  • 2025年美国投行求职机构哪家高效不爆雷:助学员成本降60%offer量产
  • Go 语言(Golang):核心特性、生态优势与实战应用全解析
  • 详细介绍:《Windows 服务器 WinSCP 保姆级配置指南:从 0 到 1 实现 “无痛” 远程文件管理》
  • 多项式学习笔记
  • Kubernetes(K8s):核心概念、架构与实战应用全解析
  • 从零到实战:Go 语言高效学习路线
  • 每个人都在追寻远方,那远方的人是否也有自己的远方呢?
  • 2025年12月美国投行求职机构哪家好:数据揭晓98%靠谱专业的机构
  • 2025年12月安全的轮胎推荐:专业安全胎权威指南
  • SUV车型轮胎推荐:权威SUV胎专业推荐
  • 抑郁症治疗指南
  • 4. 垃圾回收机制(GC)
  • “游戏无法启动”、“DLL文件丢失”或“缺少组件”怎么办
  • 家用轿车轮胎推荐:十大家轿胎深度榜单
  • 舒适的轮胎推荐:TOP10舒适胎专业测评
  • Less-8 GET-Blind-Boolean Based-Single Quotes - 详解
  • 2025年节油的轮胎推荐:权威省油胎最新榜单
  • 2025年丰田凯美瑞更换轮胎推荐:权威轮胎推荐必读攻略
  • 2025年操控的轮胎推荐:十大操控胎深度解析
  • 第3章栈和队列
  • 2025年本田雅阁更换轮胎推荐:专业轮胎选择深度解析
  • 奔跑
  • 论文写作辅助必备!7款AI工具让你轻松搞定论文,查重无忧
  • 运动补偿中的距离对准技术:原理、方法与应用
  • 记一次Sqlserver数据库存储过程调用导致的连接池耗尽事件
  • 2025/12/6下午计划
  • 12.6
  • 2025年下半年上海ISO27001认证机构综合评估与选择指南