
Analyzing and Predicting Successful Matches in Speed Dating

Part 1: Data Cleaning and Preprocessing
The code first reads the names of 33 variables from a text file in preparation for loading the data. It then loads the raw CSV file containing 7,644 records. During cleaning, it drops every record with missing values, reducing the data to 978 rows. Finally, it converts the text-valued gender, same-race, and match fields into the 0/1 numeric format that machine learning models can consume.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from matplotlib.ticker import FormatStrFormatter

# ===== Data cleaning part =====
with open("variables.txt", 'r', encoding='utf-8') as file:
    lines = file.readlines()[2:]  # Skip the first two rows

# Extract the first word of each line as the variable name
variables = [line.strip().split()[0] for line in lines
             if line.strip() and not line.startswith('VariablesName')]
print("List of variables:", variables)
print("Number of variables:", len(variables))

# Read the CSV file
df = pd.read_csv('SpeedDating.csv', header=None, names=variables)

# Display the number of raw observation records
print(f"Number of raw data observation records: {len(df)}")

# Delete records with missing values and make a copy
df_cleaned = df.dropna().copy()
print(f"Number of observations remaining after removing missing values: {len(df_cleaned)}")

# Convert the categorical variable "gender" to binary form:
# cast the column to string and strip the b'...' byte-literal wrapper
df_cleaned['gender'] = df_cleaned['gender'].astype(str).str.replace("b'", "").str.replace("'", "")
# Map male to 1 and female to 0
df_cleaned['gender'] = df_cleaned['gender'].map({'male': 1, 'female': 0})

# Convert the samerace and match columns to integers
df_cleaned["samerace"] = df_cleaned["samerace"].apply(lambda x: 1 if x == "b'1'" else 0)
df_cleaned["match"] = df_cleaned["match"].apply(lambda x: 1 if x == "b'1'" else 0)

# Store the processed data set as a new CSV file
df_cleaned.to_csv('dataforanalysis.csv', index=False, encoding='utf-8')
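The byte-literal cleanup above can also be done in one pass with an anchored regex, which avoids accidentally deleting apostrophes inside values. A minimal, self-contained sketch on toy data (the column values are assumptions mimicking the raw file):

```python
import pandas as pd

# Toy frame mimicking the b'...' byte-string encoding seen in the raw CSV
raw = pd.DataFrame({
    "gender": ["b'male'", "b'female'", "b'male'"],
    "samerace": ["b'1'", "b'0'", "b'1'"],
    "match": ["b'0'", "b'1'", "b'1'"],
})

def strip_bytes_literal(s: pd.Series) -> pd.Series:
    # Remove only the leading b' and the trailing ' (anchored at string ends)
    return s.str.replace(r"^b'|'$", "", regex=True)

cleaned = raw.apply(strip_bytes_literal)
cleaned["gender"] = cleaned["gender"].map({"male": 1, "female": 0})
cleaned[["samerace", "match"]] = cleaned[["samerace", "match"]].astype(int)
print(cleaned)
```

The anchored pattern makes the intent explicit and handles all columns uniformly, so no per-column lambda is needed.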

Part 2: Exploratory Data Analysis and Feature Engineering
With the data prepared, the code turns to exploratory analysis. It computes descriptive statistics for all numeric variables and plots histograms to visualize their distributions. More importantly, it engineers new, more predictive features based on domain understanding. For example, it computes the age difference between each participant and their partner and plots its distribution. It also combines each participant's stated importance weights for various traits with the actual ratings they gave their partner, producing six new "relative score" features.

# ===== Exploratory data analysis =====
# Read the cleaned data
df1 = pd.read_csv("dataforanalysis.csv")

# Select numeric columns, excluding the categorical variables
exclude_cols = ['gender', 'samerace', 'match']
numeric_cols = df1.select_dtypes(include=['number']).columns.difference(exclude_cols)

# Descriptive statistics
desc_stats = df1[numeric_cols].describe().T
print(desc_stats)

# Set the plotting style
sns.set(style="whitegrid")

# Plot histograms of all numeric variables
nrows = -(-len(numeric_cols) // 3)  # ceiling division so no column is dropped
fig, axes = plt.subplots(nrows=nrows, ncols=3, figsize=(18, 5 * nrows))
axes = axes.flatten()
for i, col in enumerate(numeric_cols):
    df1[col].hist(bins=20, ax=axes[i], color='skyblue', edgecolor='black')  # df1, not df
    axes[i].set_title(f'Histogram of {col}')
    axes[i].set_xlabel(col)
    axes[i].set_ylabel('Frequency')
plt.tight_layout()
plt.show()

# Calculate the absolute value of the age difference (guaranteed non-negative)
df1['age_diff'] = (df1['age_o'] - df1['age']).abs()
print("Difference in age:")
print(df1['age_diff'])

# Count records grouped by age difference
age_diff_counts = df1['age_diff'].value_counts().sort_index()

# Plot the bar chart
plt.figure(figsize=(10, 6))
plt.bar(age_diff_counts.index, age_diff_counts.values, width=0.8, color='darkblue')
plt.title('Age Difference Distribution', fontsize=14)
plt.xlabel('Age Difference Groups', fontsize=12)
plt.ylabel('Number of Observations', fontsize=12)
plt.xticks(range(int(age_diff_counts.index.min()), int(age_diff_counts.index.max()) + 1))
plt.gca().xaxis.set_major_formatter(FormatStrFormatter('%.1f'))  # one-decimal tick labels
plt.grid(False)
plt.tight_layout()
plt.show()

# Get the data for the first participant
first_participant = df1.iloc[0]

# Importance-score column names
importance_cols = [
    'attractive_important', 'sincere_important', 'intelligence_important',
    'funny_important', 'ambition_important', 'shared_interests_important'
]
importance_scores = first_participant[importance_cols].values

# Draw a pie chart of the first participant's importance weights
plt.figure(figsize=(10, 8))
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7', '#DDA0DD']
wedges, texts, autotexts = plt.pie(
    importance_scores, labels=importance_cols, autopct='%1.1f%%',
    colors=colors, startangle=90, textprops={'fontsize': 12}
)
plt.title('Importance the first participant assigned to partner traits',
          fontsize=16, fontweight='bold', pad=20)
plt.axis('equal')  # keep the pie chart circular
plt.tight_layout()
plt.show()

# Calculate relative scores and add them to the data frame
partner_cols = [
    'attractive_partner', 'sincere_partner', 'intelligence_partner',
    'funny_partner', 'ambition_partner', 'shared_interests_partner'
]
for importance_col, partner_col in zip(importance_cols, partner_cols):
    relative_col_name = f'relative_{importance_col.split("_")[0]}_score'
    df1[relative_col_name] = df1[partner_col] * (df1[importance_col] / 100)

# The six new columns
relative_cols = [
    'relative_attractive_score', 'relative_sincere_score', 'relative_intelligence_score',
    'relative_funny_score', 'relative_ambition_score', 'relative_shared_score'
]
print("Results of relative scores calculated for all participants:")
print(df1[relative_cols])

# Save to a new file
new_filename = "enhanced_dataset.csv"
df1.to_csv(new_filename, index=False)
print(f"\nThe new file containing the original data, age difference, and relative scores "
      f"has been saved as: {new_filename}")
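The relative-score feature is simply the partner rating scaled by how much the rater cares about that trait: relative = rating × (importance / 100). A minimal sketch on toy numbers (the two rows are invented for illustration):

```python
import pandas as pd

# Toy data: one trait's importance weight (out of 100) and the partner rating (1-10)
toy = pd.DataFrame({
    "attractive_important": [20.0, 50.0],
    "attractive_partner": [8.0, 6.0],
})

# Relative score = partner rating scaled by the rater's importance weight
toy["relative_attractive_score"] = toy["attractive_partner"] * (toy["attractive_important"] / 100)
print(toy["relative_attractive_score"].tolist())  # [1.6, 3.0]
```

A high rating on a trait the rater barely cares about (row 1) thus contributes less than a middling rating on a trait weighted heavily (row 2).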

Part 3: Building and Evaluating the Prediction Model
Finally, the code builds a machine learning model from the processed data. It splits the data into training and test sets and trains a decision tree to predict whether a date ends in a match. The model reaches roughly 82% accuracy on the test set. A detailed classification report and a confusion matrix then give a fuller picture of performance, revealing that predictions for the minority "match" class are actually mediocre, which points the way for subsequent model optimization.

# ===== Data prediction part =====
# Read the enhanced data
df2 = pd.read_csv("enhanced_dataset.csv")

# Prepare the features and target (X and y must be defined before train_test_split)
X = df2.drop('match', axis=1)  # features
y = df2['match']               # target variable

# Split into training and test sets, stratified to preserve the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define the prediction function
def simple_match_prediction(model_type):
    # Select the model
    if model_type == 'decision_tree':
        model = DecisionTreeClassifier(random_state=42, max_depth=5)
    else:
        raise ValueError("Unsupported model type")

    # Train the model
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model: {model_type}")
    print(f"Accuracy: {accuracy:.4f}")
    return model, accuracy

# Test the decision tree model
model_dt, acc_dt = simple_match_prediction(model_type='decision_tree')

# Define the evaluation function
def quick_evaluation(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)

    # Classification report
    print(f"\n{model_name} - Classification report:")
    print(classification_report(y_test, y_pred, target_names=['mismatch', 'match']))

    # Confusion matrix
    plt.figure(figsize=(6, 5))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} - Confusion matrix')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

    # Main metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print("\nMain metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 score: {f1:.4f}")

    return {
        'accuracy': accuracy, 'precision': precision,
        'recall': recall, 'f1_score': f1, 'predictions': y_pred
    }

# Train a model (decision tree example)
model = DecisionTreeClassifier(max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Call the evaluation function with the model, test set, and a display name
evaluation_results = quick_evaluation(
    model=model, X_test=X_test, y_test=y_test,
    model_name="Decision tree model"
)
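Since the evaluation shows weak performance on the minority "match" class, one common mitigation is scikit-learn's class_weight='balanced' option, which reweights the loss toward rare classes. A minimal sketch on synthetic data (the ~16% positive rate and all data here are assumptions standing in for the real set, not its actual results):

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score

# Synthetic imbalanced data standing in for the speed-dating set (~16% positives)
X, y = make_classification(n_samples=1000, weights=[0.84], random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Plain tree vs. a tree whose loss upweights the rare positive class
plain = DecisionTreeClassifier(max_depth=5, random_state=42).fit(X_tr, y_tr)
weighted = DecisionTreeClassifier(
    max_depth=5, random_state=42, class_weight="balanced"
).fit(X_tr, y_tr)

print("recall (plain):   ", recall_score(y_te, plain.predict(X_te)))
print("recall (balanced):", recall_score(y_te, weighted.predict(X_te)))
```

Comparing minority-class recall between the two fits is one quick way to check whether reweighting helps; resampling (e.g. oversampling the positives) is an alternative direction.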