基于澳大利亚气象数据集可视化分析及降雨预测

找专业人士帮助你

1：探索性分析代码流程

2：数字特征处理

3：非数字类型的特征处理

4：特征拼接

5：标签处理

6：数据集划分

7：模型评价

8：数据可视化

本文仅供大家参考学习，如有不足之处，欢迎批评指正，也请多多包涵；请勿完全照搬，如需源码请下载压缩包。

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
# NOTE(review): `test` (here) and `__all__` (below) are not used anywhere in
# this script and look like accidental IDE auto-imports. Importing `__all__`
# also rebinds this module's own `__all__`. Confirm and remove.
from matplotlib import style, test
import seaborn as sns
from sklearn import preprocessing, __all__
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.ensemble import RandomForestClassifier  # random forest
from sklearn import tree  # decision tree
from sklearn import metrics  # evaluation metrics

style.use('ggplot')  # plot theme

# Let matplotlib render Chinese text.
plt.rcParams['font.sans-serif'] = ['SimHei']  # default font
plt.rcParams['axes.unicode_minus'] = False  # keep '-' from rendering as a box

# Columns shared by process_data() (which writes them) and main() (which
# passes them to the model), kept in one place so the two cannot drift apart.
# NOTE(review): RISK_MM presumably records the next day's rainfall amount,
# which would leak the RainTomorrow target into the features -- confirm
# against the dataset description and drop it from the model features if so.
NUMERIC_COLS = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
                'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
                'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
                'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RISK_MM']
CATEGORY_COLS = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
LABEL_COL = ['label']
PROC_DATA_FILEPATH = 'proc_data.csv'


# 1. Exploratory data analysis
def inspect_data(df_data):
    """Print an exploratory overview of ``df_data``.

    Shows the first and last five rows, the frame info, summary statistics,
    which columns contain missing values, the rows that contain any missing
    value, and what the frame looks like after dropping those rows.
    ``df_data`` itself is not modified.
    """
    separator = "*" * 60

    # 1.1 first five rows
    print("查看数据的前５行")
    print(df_data.head())
    print(separator)

    # 1.2 last five rows
    print("查看数据的后5行")
    print(df_data.tail())
    print(separator)

    # 1.3 basic info; info() prints by itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None").
    print("显示数据的基本信息")
    df_data.info()
    print(separator)

    # 1.4 summary statistics
    print("显示数据的统计信息")
    print(df_data.describe())
    print(separator)

    # 1.5 missing values: per column, then every row holding at least one NaN
    print("判断哪些“列”存在缺失值")
    print(df_data.isnull().any())
    print(separator)
    print("找出含有nan的所有行")
    print(df_data[df_data.isnull().any(axis=1)])
    print(separator)

    # 1.6 preview of the frame with NaN rows removed.
    # dropna parameters:
    #   axis: 0 works on rows (default), 1 works on columns
    #   how: 'any' drops when any value is missing (default),
    #        'all' drops only when every value is missing
    #   inplace: False returns a new frame (default), True edits in place
    print("空值处理，删除空值所在的行")
    print(df_data.dropna())
    print(separator)


# 2. Data analysis and plotting (pandas)
def analysis_data(df_data):
    """Print a subset of columns and scatter-plot rainfall against month."""
    use_cols = ['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall',
                'Sunshine', 'Evaporation', 'RainToday', 'Pressure9am',
                'Pressure3pm', 'WindGustDir', 'WindGustSpeed', 'RISK_MM',
                'RainTomorrow']
    # Copy so the Date conversion below writes to an independent frame rather
    # than to a slice of df_data (avoids SettingWithCopyWarning).
    use_data = df_data[use_cols].copy()
    print("数据分析总览，查看使用列数据的前10行")
    print(use_data.head(10))
    print("*" * 60)

    # Rainfall by month
    print("*" * 60)
    print("时间类型转换.....")
    # Step 1: turn the date column into a month number
    use_data['Date'] = pd.to_datetime(use_data['Date']).dt.month
    print(use_data.head())
    print("*" * 60)

    # Step 2: month vs. rainfall
    fig1 = plt.figure()
    ax = fig1.add_subplot(1, 1, 1)
    ax.scatter(use_data['Date'], use_data['Rainfall'])
    ax.set_ylabel('Rainfall')
    ax.set_xlabel('Month')
    plt.show()


def analysis_data1(df_data):
    """Show a pie chart of the RainToday value proportions."""
    df_data['RainToday'].value_counts().plot(kind='pie', autopct='%.2f%%')
    plt.title('RainToday样本比例')
    plt.tight_layout()
    plt.show()


def analysis_data2(df_data):
    """Show a 4x2 colour grid labelled with feature and target names.

    NOTE(review): the grid values come from ``np.random.rand`` and
    ``df_data`` is never read, so the picture does not reflect the dataset.
    Confirm what statistic was meant to be shown here.
    """
    data = np.random.rand(4, 2)
    rows = ['MinTemp', 'MaxTemp', 'Rainfall', 'RISK_MM']  # row categories
    columns = ['RainToday', 'RainTomorrow']  # column categories

    fig, ax = plt.subplots()
    ax.pcolor(data, cmap=plt.cm.Reds, edgecolors='k')
    # Put each tick in the middle of its cell
    ax.set_xticks(np.arange(0, 2) + 0.5)
    ax.set_yticks(np.arange(0, 4) + 0.5)
    ax.xaxis.tick_bottom()
    ax.yaxis.tick_left()
    ax.set_xticklabels(columns, minor=False, fontsize=20)
    ax.set_yticklabels(rows, minor=False, fontsize=10)
    plt.show()


def analysis_data3(df_data):
    """Show stacked counts of RainTomorrow, coloured by RainToday."""
    sns.displot(data=df_data, x='RainTomorrow', hue='RainToday',
                multiple="stack", height=6, aspect=1)
    plt.show()


def analysis_data4(df_data):
    """Scatter MinTemp against MaxTemp for a random sample of rows."""
    plt.figure(figsize=(12, 6))
    # sample() raises when asked for more rows than exist, so cap the size.
    sample_size = min(2022, len(df_data))
    sns.scatterplot(data=df_data.sample(sample_size), x='MinTemp',
                    y='MaxTemp', hue='RainTomorrow')
    plt.show()


# Label handling
def create_label(RainTomorrow_val):
    """Map a RainTomorrow value to a label: 'Yes' -> 0, anything else -> 1."""
    return 0 if RainTomorrow_val == 'Yes' else 1


# Data preparation
def process_data(df_data):
    """Build the labelled, NaN-free modelling table and write it to CSV.

    Keeps rows whose RainTomorrow is 'Yes' or 'No', adds a ``label`` column
    through create_label, shows the label proportions as a pie chart, selects
    the numeric, category and label columns, drops rows with missing values
    and writes the result to ``proc_data.csv``.
    """
    filter_mask = df_data['RainTomorrow'].isin(['Yes', 'No'])
    filter_data = df_data[filter_mask]
    print(filter_data['RainTomorrow'].value_counts())
    print("*" * 60)

    # Add the 0/1 label: 'Yes' -> 0, 'No' -> 1
    proc_filter_data = filter_data.copy()
    proc_filter_data['label'] = filter_data['RainTomorrow'].apply(create_label)
    print(proc_filter_data.head())
    print("*" * 60)

    # Proportion of the two labels
    proc_filter_data['label'].value_counts().plot(kind='pie', autopct='%.2f%%')
    plt.title('RainTomorrow正负样本比例')
    plt.tight_layout()
    plt.show()

    user_cols = NUMERIC_COLS + CATEGORY_COLS + LABEL_COL
    # Non-inplace dropna: the column selection is a slice, and an inplace
    # drop on it raises SettingWithCopyWarning.
    final_samples = proc_filter_data[user_cols].dropna()
    final_samples.to_csv(PROC_DATA_FILEPATH, index=False)


def _print_class_balance(y):
    """Print the sample count and the shares of label 1 and label 0 in ``y``."""
    n_sample = y.shape[0]
    n_pos_sample = y[y == 1].shape[0]
    n_neg_sample = y[y == 0].shape[0]
    print('样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(
        n_sample, n_pos_sample / n_sample, n_neg_sample / n_sample))


def perform_machine_learning(data_filepath, numeric_cols, category_cols, label_col):
    """Train a logistic-regression classifier and print its test metrics.

    Parameters
    ----------
    data_filepath : path of the CSV file to read
    numeric_cols : names of the numeric feature columns
    category_cols : names of the categorical feature columns
    label_col : name(s) of the label column

    Returns
    -------
    None. Accuracy, precision and recall (for label 1) are printed.
    """
    data = pd.read_csv(data_filepath)
    numeric_feat = data[numeric_cols].values

    # One-hot encode every categorical column. The previous code encoded only
    # the first one and silently discarded the rest.
    onehot_enc = preprocessing.OneHotEncoder()
    category_feat = onehot_enc.fit_transform(data[category_cols].values).toarray()

    # Final feature matrix and a flat label vector
    X = np.hstack((numeric_feat, category_feat))
    y = data[label_col].values.ravel()

    _print_class_balance(y)
    print('特征维数：', X.shape[1])

    # Split BEFORE oversampling so no synthetic sample (interpolated from
    # rows that may end up in the test set) leaks into the evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Balance the classes in the training set only
    oversample = SMOTE()
    X_train, y_train = oversample.fit_resample(X_train, y_train)
    print('通过SMOTE方法平衡正负样本后')
    _print_class_balance(y_train)

    # C is a hyper-parameter of LogisticRegression; cross-validation could be
    # used to choose it. tree.DecisionTreeClassifier() and
    # RandomForestClassifier() are drop-in alternatives.
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred); passing them the other way
    # round swaps precision and recall.
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, pos_label=1)
    recall = metrics.recall_score(y_test, y_pred, pos_label=1)
    print('准确率为：', accuracy)
    print('精确率为：', precision)
    print('召回率：', recall)


def main():
    """Run the inspection, the plots, the preprocessing and the model."""
    csvfile = "weatherAUS.csv"
    raw_data = pd.read_csv(csvfile)

    inspect_data(raw_data)
    analysis_data(raw_data)
    analysis_data1(raw_data)
    analysis_data2(raw_data)
    analysis_data3(raw_data)
    analysis_data4(raw_data)
    process_data(raw_data)

    perform_machine_learning(PROC_DATA_FILEPATH, NUMERIC_COLS,
                             CATEGORY_COLS, LABEL_COL)


if __name__ == '__main__':
    main()

# Written by Xiao Xie; corrections and feedback are welcome.

找专业人士帮助你

原文链接：https://blog.csdn.net/m0_62909438/article/details/127158166?ops_request_misc=&request_id=3aaa6e03f64d4a2782b103a1fef4ac1f&biz_id=&utm_medium=distribute.pc_search_result.none-task-blog-2~blog~koosearch~default-23-127158166-null-null.268%5Ev1%5Econtrol&utm_term=%E6%BE%B3%E6%B4%B2%E7%94%9F%E6%B4%BB

基于澳大利亚气象数据集可视化分析及降雨预测

作者: 知澳头条

发表回复 取消回复

联系我们

微信扫一扫关注我们

给这篇文章的作者打赏

作者: 知澳头条

相关文章

AI，正在疯狂进化，金融大模型来了

案例告诉你 ChatGPT 最有可能取代哪些职业

‎AplikacjaChatHER

中国没有ChatGPT

中国没有ChatGPT

这家公司打算用AI技术普及法律咨询服务｜专访

发表回复 取消回复

联系我们

微信扫一扫关注我们

发表回复 取消回复