Solar Station Efficiency Prediction Based on the Kaggle European Solar Generation Dataset

We keep only one station (FR10). Basic scikit-learn ML models are used to forecast one month ahead, and deep learning models are used to forecast one to two days ahead.
Performance metric: root mean squared error (RMSE). Exploratory analysis shows that the dataset is clean: no outliers, no duplicate rows, and no missing values.
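These cleanliness claims are easy to verify with a few pandas checks. The snippet below is only a minimal sketch, assuming the CSV path used in the scripts that follow and one efficiency column per station (e.g. FR10):

import pandas as pd

# Load the raw station-level generation data (same path as used in the scripts below)
df = pd.read_csv("dataset/solar_generation_by_station.csv")

# Missing values across the whole frame (expected: 0)
print("missing values:", df.isna().sum().sum())

# Exact duplicate rows (expected: 0)
print("duplicated rows:", df.duplicated().sum())

# Station efficiencies are ratios, so values far outside [0, 1] would be suspicious
fr_cols = [c for c in df.columns if c.startswith('FR')]
print(df[fr_cols].describe().loc[['min', 'max']])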
1. Baseline Model
The baseline model's results will serve as the benchmark against which all other models are compared.
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from dataprepare import dataset_con
from visualize import plot_scores, plot_predictions
import warnings

warnings.filterwarnings("ignore")
pd.options.display.max_columns = 300

# Raw data
df = pd.read_csv("dataset/solar_generation_by_station.csv")
train_data, test_data = dataset_con(df)

model_instances, model_names, rmse_train, rmse_test = [], [], [], []

# Build the training and test sets
x_train, y_train = train_data.drop(columns=['time']), train_data['FR10']
x_test, y_test = test_data.drop(columns=['time']), test_data['FR10']

# Baseline model, used as the reference
def mean_df(d, h):
    """Return the hourly mean of a specific day of the year."""
    return x_train[(x_train['day'] == d) & (x_train['hour'] == h)]['FR10'].mean()

# Add the baseline predictions to the datasets
x_train['pred'] = x_train.apply(lambda x: mean_df(x.day, x.hour), axis=1)
x_test['pred'] = x_test.apply(lambda x: mean_df(x.day, x.hour), axis=1)

model_names.append("base_line")
rmse_train.append(np.sqrt(mean_squared_error(x_train['FR10'], x_train['pred'])))
rmse_test.append(np.sqrt(mean_squared_error(x_test['FR10'], x_test['pred'])))

# Plot last month's predictions (orange) against the actual values (blue)
plot_predictions(data=x_test[['FR10', 'pred']])
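dataset_con is imported from the local dataprepare module, which the post never shows. The sketch below is only a guess at its behaviour, inferred from how its output is used above (a 'time' column plus 'month'/'week'/'day'/'hour' features, with the last month held out for testing); the real implementation may differ.

import pandas as pd

# Hypothetical sketch of dataprepare.dataset_con (the real module is not shown in the post)
def dataset_con(df, test_hours=30 * 24):
    df = df.copy()
    df['time'] = pd.to_datetime(df['time'])            # assumes a parseable 'time' column
    df['month'] = df['time'].dt.month
    df['week'] = df['time'].dt.isocalendar().week.astype(int)
    df['day'] = df['time'].dt.dayofyear                # the baseline uses day of the year
    df['hour'] = df['time'].dt.hour
    # Chronological split: keep the last month (30 days * 24 hours) as the test set
    return df.iloc[:-test_hours], df.iloc[-test_hours:]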
2. Regression Models
Several regression models are used for prediction below. Their performance on the test set is compared to determine which model works best.
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from dataprepare import dataset_con
from visualize import plot_scores, plot_predictions
import warnings

warnings.filterwarnings("ignore")
pd.options.display.max_columns = 300

# Raw data
df = pd.read_csv(r"F:\mygithub\Big_Data_Renewable_energies-master\dataset\solar_generation_by_station.csv")
train_data, test_data = dataset_con(df)

model_instances, model_names, rmse_train, rmse_test = [], [], [], []

# Build the training and test sets
X_train, y_train = train_data[['month', 'week', 'day', 'hour']], train_data['FR10']
X_test, y_test = test_data[['month', 'week', 'day', 'hour']], test_data['FR10']

# Models to train
from sklearn.neighbors import KNeighborsRegressor                             # k-nearest neighbours
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet   # linear, ridge, lasso, elastic net
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR, SVR
import xgboost as xgb
import lightgbm as lgb

# Print the scores
def get_rmse(reg, model_name):
    """Print and return the RMSE of the given fitted model on the training and test sets."""
    y_train_pred, y_pred = reg.predict(X_train), reg.predict(X_test)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"{model_name}\t - RMSE on Training = {rmse_train:.2f} / RMSE on Test = {rmse_test:.2f}")
    return rmse_train, rmse_test

# List of all the basic models tried initially
model_list = [LinearRegression(), Lasso(), Ridge(), ElasticNet(),
              RandomForestRegressor(), GradientBoostingRegressor(), ExtraTreesRegressor(),
              xgb.XGBRegressor(), KNeighborsRegressor(), lgb.LGBMRegressor()]

# Collect the model names for the score lists
model_names.extend([str(m)[:str(m).index('(')] for m in model_list])

# Train and evaluate every model
for model, name in zip(model_list, model_names):
    model.fit(X_train, y_train)
    sc_train, sc_test = get_rmse(model, name)
    rmse_train.append(sc_train)
    rmse_test.append(sc_test)
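After the loop finishes, the collected lists can be assembled into a small comparison table to see at a glance which model has the lowest test error. This is just a convenience snippet continuing the script above; it only uses the model_names, rmse_train and rmse_test lists already built.

# Rank the trained models by test RMSE (lower is better)
scores = pd.DataFrame({'model': model_names,
                       'rmse_train': rmse_train,
                       'rmse_test': rmse_test})
print(scores.sort_values('rmse_test').to_string(index=False))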
Comparison of results
base_line                  - RMSE on Training = 0.21 / RMSE on Test = 0.15
LinearRegression           - RMSE on Training = 0.21 / RMSE on Test = 0.15
Lasso                      - RMSE on Training = 0.21 / RMSE on Test = 0.15
Ridge                      - RMSE on Training = 0.21 / RMSE on Test = 0.15
ElasticNet                 - RMSE on Training = 0.10 / RMSE on Test = 0.10
RandomForestRegressor      - RMSE on Training = 0.11 / RMSE on Test = 0.09
GradientBoostingRegressor  - RMSE on Training = 0.10 / RMSE on Test = 0.10
ExtraTreesRegressor        - RMSE on Training = 0.11 / RMSE on Test = 0.09
XGBRegressor               - RMSE on Training = 0.10 / RMSE on Test = 0.10
LGBMRegressor              - RMSE on Training = 0.10 / RMSE on Test = 0.10
3. Deep Learning
We try to predict the FR10 value for the next hour from all features (the efficiencies of all the other stations) over the past 2 days (48 hours).
3.1 Dataset Construction
df = pd.read_csv("dataset/solar_generation_by_station.csv")
df = df[sorted([c for c in df.columns if 'FR' in c])]

# Keep only the last 4 years of FR data
df = df[-24*365*4:]

# Data preparation: given the dataframe and a lookback window,
# each element of X is one 48-hour slice taken over the 4 years
def process_data(data, past):
    X = []
    for i in range(len(data) - past - 1):
        X.append(data.iloc[i:i+past].values)
    return np.array(X)

# Predict the value one hour ahead from the past 2 days of features
lookback = 48

# Predict only for station FR10: y contains all values after the first 48 hours,
# and each element of X is the 48 hours of data preceding the corresponding value of y
y = df['FR10'][lookback+1:]
X = process_data(df, lookback)

from sklearn.model_selection import train_test_split

# Split into training and test sets without shuffling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=False)
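Before training the recurrent networks it is worth checking the array shapes. With the 22 French station columns kept above (the same 22 that appear in input_shape below), the windowed tensor should look like this; the exact number of samples depends on the dataset length.

# Sanity check: X should be (n_samples, lookback, n_stations), y should be (n_samples,)
print(X.shape)                       # e.g. (n_samples, 48, 22)
print(y.shape)
print(X_train.shape, X_test.shape)   # 80% / 20% chronological split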
3.2 Building, Training and Testing the RNN, LSTM and GRU Models
'''RNN'''
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Embedding, Dropout
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, Dropout
from visualize import plot_evolution   # assumed to live in the same local visualize module

def my_RNN():
    my_rnn = Sequential()
    my_rnn.add(SimpleRNN(units=32, return_sequences=True, input_shape=(lookback, 22)))
    my_rnn.add(SimpleRNN(units=32, return_sequences=True))
    my_rnn.add(SimpleRNN(units=32, return_sequences=False))
    my_rnn.add(Dense(units=1, activation='linear'))
    return my_rnn

rnn_model = my_RNN()
rnn_model.compile(optimizer='adam', loss='mean_squared_error')
rnn_model.fit(x=X_train, y=y_train, validation_data=(X_test, y_test), epochs=50, batch_size=64)

y_pred_train, y_pred_test = rnn_model.predict(X_train), rnn_model.predict(X_test)
err_train_rnn = np.sqrt(mean_squared_error(y_train, y_pred_train))
err_test_rnn = np.sqrt(mean_squared_error(y_test, y_pred_test))

def append_results(model_name, err_train, err_test):
    model_names.append(model_name)
    rmse_train.append(err_train)
    rmse_test.append(err_test)

append_results("RNN", err_train_rnn, err_test_rnn)
plot_evolution(X_train, y_train, X_test, y_test, y_pred_test)

rnn_res = pd.DataFrame(zip(list(y_test), list(np.squeeze(y_pred_test))), columns=['FR10', 'pred'])
plot_predictions(data=rnn_res[-30*24:])

'''GRU'''
from keras.layers import GRU

def my_GRU(input_shape):
    my_GRU = Sequential()
    my_GRU.add(GRU(units=32, return_sequences=True, activation='relu', input_shape=input_shape))
    my_GRU.add(GRU(units=32, activation='relu', return_sequences=False))
    my_GRU.add(Dense(units=1, activation='linear'))
    return my_GRU

gru_model = my_GRU(X.shape[1:])
gru_model.compile(optimizer='adam', loss='mean_squared_error')
gru_model.fit(x=X_train, y=y_train, validation_data=(X_test, y_test), epochs=50, batch_size=32)

y_pred_train, y_pred_test = gru_model.predict(X_train), gru_model.predict(X_test)
err_train_gru = np.sqrt(mean_squared_error(y_train, y_pred_train))
err_test_gru = np.sqrt(mean_squared_error(y_test, y_pred_test))

append_results("GRU", err_train_gru, err_test_gru)
plot_evolution(X_train, y_train, X_test, y_test, y_pred_test)

gru_res = pd.DataFrame(zip(list(y_test), list(np.squeeze(y_pred_test))), columns=['FR10', 'pred'])
plot_predictions(data=gru_res[-30*24:])

'''LSTM'''
from keras.layers import LSTM

def my_LSTM(input_shape):
    my_LSTM = Sequential()
    my_LSTM.add(LSTM(units=32, return_sequences=True, activation='relu', input_shape=input_shape))
    my_LSTM.add(LSTM(units=32, activation='relu', return_sequences=False))
    my_LSTM.add(Dense(units=1, activation='linear'))
    return my_LSTM

lstm_model = my_LSTM(X.shape[1:])
lstm_model.compile(optimizer='adam', loss='mean_squared_error')
lstm_model.fit(x=X_train, y=y_train, validation_data=(X_test, y_test), epochs=50, batch_size=32)

y_pred_train, y_pred_test = lstm_model.predict(X_train), lstm_model.predict(X_test)
err_train_lstm = np.sqrt(mean_squared_error(y_train, y_pred_train))
err_test_lstm = np.sqrt(mean_squared_error(y_test, y_pred_test))

append_results("LSTM", err_train_lstm, err_test_lstm)
plot_evolution(X_train, y_train, X_test, y_test, y_pred_test)

lstm_res = pd.DataFrame(zip(list(y_test), list(np.squeeze(y_pred_test))), columns=['FR10', 'pred'])
plot_predictions(data=lstm_res[-30*24:])

# Compare all models
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('fivethirtyeight')
plot_scores(model_names, rmse_train, rmse_test)

df_score = pd.DataFrame({'model_names': model_names, 'rmse_test': rmse_test})
plt.figure(figsize=(12, 8))
sns.barplot(y="model_names", x="rmse_test", data=df_score, palette="Blues_d")
plt.title("Comparison of errors for each model", fontsize=20)
plt.xlabel("RMSE - the smaller, the better the model", fontsize=16)
plt.ylabel("Models tried", fontsize=16)
plt.show()
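Once trained, any of the three networks can produce a one-hour-ahead forecast from the most recent 48-hour window. A minimal usage sketch with the GRU model:

# One-hour-ahead FR10 forecast from the latest 48-hour window of the test set
latest_window = X_test[-1:]                              # shape (1, 48, 22)
next_hour_fr10 = float(gru_model.predict(latest_window).squeeze())
print("Predicted FR10 efficiency for the next hour:", next_hour_fr10)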
Results for all models