Commit 783cdd1f authored by H hypox64

0.11953

Parent 24336635
import dataloader
import transformer
import numpy as np

correlates = []
desc_map,price_map = dataloader.load_train()
price_map['price'] = transformer.normlize(price_map['price'])
key = ''
desc_map[key] = transformer.normlize(desc_map[key])
print(np.correlate(desc_map[key],price_map['price']))
# for key in desc_map.keys():
#     desc_map[key] = transformer.normlize(desc_map[key])
#     correlates.append(np.correlate(desc_map[key],price_map['price'])[0][1])
# print(correlates)
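A note on the correlation call above: np.correlate computes a sliding dot product, not a correlation coefficient, and in its default 'valid' mode it returns a single-element array for two equal-length vectors, so the [0][1] indexing in the commented-out loop would fail. That indexing matches np.corrcoef, which returns a 2x2 correlation matrix. A minimal sketch of the presumably intended Pearson computation (not part of the commit; toy data):

import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([2.1, 3.9, 6.2, 8.0])
# np.corrcoef returns [[1, r], [r, 1]]; take an off-diagonal entry.
r = np.corrcoef(x, y)[0][1]
print(r)  # Pearson r, close to 1.0 for this near-linear pair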
@@ -3,11 +3,8 @@ import csv
 import numpy as np
 import random
 from sklearn.decomposition import PCA
-from sklearn.model_selection import GridSearchCV
-from sklearn.kernel_ridge import KernelRidge
-from sklearn.svm import SVR
 import evaluation
-from description_map import value_map,fix_key,fix_miss,add_future
+from description_map import value_map,fix_key,fix_miss,add_future,fix_LotFrontage
 # load description_txt
 description_txt = []
@@ -52,13 +49,6 @@ for i in range(len(colon_indexs)-1):
         ori_map[key] = interspace-j-1 #change word to vector
     Full_map[desc_key]=ori_map
-# def normlize(npdata,justprice = False):
-#     _mean = np.mean(npdata)
-#     _std = np.std(npdata)
-#     if justprice:
-#         _mean = 180921.195
-#         _std = 79415.2918
-#     return (npdata-_mean)/_std
 def normlize(npdata,justprice = False):
     _min = np.min(npdata)
@@ -68,8 +58,6 @@ def normlize(npdata,justprice = False):
         _max = 755000.0
     return (npdata-_min)/(_max-_min)
-# def convert2price(tensor):
-#     return tensor*79415.2918+180921.195
 def convert2price(tensor):
     return tensor*(755000.0-34900.0)+34900
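The constants 34900.0 and 755000.0 are evidently the training-set price minimum and maximum (this commit switches from the old mean/std standardization to min-max scaling), so normlize(..., justprice=True) and convert2price are inverses over that range. A small sanity-check sketch, not part of the commit:

import numpy as np

prices = np.array([34900.0, 180921.0, 755000.0])
scaled = (prices - 34900.0) / (755000.0 - 34900.0)    # what normlize(prices, True) computes
restored = scaled * (755000.0 - 34900.0) + 34900      # what convert2price computes
assert np.allclose(prices, restored)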
@@ -163,28 +151,32 @@ def dict2numpy(dict_data):
     return np_data

 def load_all(dimension):
-    desc_map,price_map = load_train()
-    desc_map = add_future(desc_map)
-    # print(len(desc_map))
-    # print(desc_map)
-    # print(desc_map)
-    train_price = np.array(price_map['price'])
-    train_desc = dict2numpy(desc_map)
-    desc_map = load_test()
+    train_desc_map,train_price_map = load_train()
+    test_desc_map = load_test()
+    desc_map = {}
+    train_length = len(list(train_desc_map.values())[0])
+    for key in train_desc_map.keys():
+        desc_map[key] = np.concatenate((train_desc_map[key],test_desc_map[key]),axis=0)
+        # desc_map[key] = normlize(desc_map[key])
+    desc_map['LotFrontage'] = fix_LotFrontage(desc_map)
+    desc_map['YearBuilt'] = (desc_map['YearBuilt']-1800)/10
+    desc_map['YearRemodAdd'] = (desc_map['YearRemodAdd']-1800)/10
     desc_map = add_future(desc_map)
-    test_desc = dict2numpy(desc_map)
-    desc_all = np.concatenate((train_desc,test_desc),axis=0)
-    for i in range(len(desc_all[0])):
-        desc_all[:,i] = normlize(desc_all[:,i])
-    # print(desc_all)
+    for key in desc_map.keys():
+        desc_map[key] = normlize(desc_map[key])
+    desc_all = dict2numpy(desc_map)
     pca=PCA(n_components=dimension) # load PCA; set the number of principal components to keep
     desc_all=pca.fit_transform(desc_all) # project the samples onto the reduced dimensions
-    train_price = normlize(train_price,True)
-    train_desc = desc_all[:len(train_desc)]
-    test_desc = desc_all[len(train_desc):]
+    train_price = normlize(np.array(train_price_map['price']),True)
+    train_desc = desc_all[:train_length]
+    test_desc = desc_all[train_length:]
     return train_desc.astype(np.float32),train_price.astype(np.float32),test_desc.astype(np.float32)
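The refactored load_all now merges train and test features before per-feature min-max scaling and PCA, then splits them back apart at train_length. A hedged usage sketch of how it is consumed elsewhere in this commit, assuming the repo's datasets are in place:

import dataloader

# dimension = number of principal components kept by PCA
train_desc, train_price, test_desc = dataloader.load_all(80)
print(train_desc.shape, train_price.shape, test_desc.shape)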
@@ -197,30 +189,11 @@ def write_csv(prices,path):
     csvFile.close()

 def main():
-    dimension = 80
-    train_desc,train_price,test_desc = load_all(dimension)
-    # # KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
-    # kr = GridSearchCV(KernelRidge(kernel='polynomial', gamma=0.1),
-    #                   param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
-    #                               "gamma": np.logspace(-2, 2, 5)})
-    # kr.fit(train_desc, train_price)
-    # y_kr = kr.predict(test_desc)
-    # for i in range(len(y_kr)):
-    #     y_kr[i] = convert2price(y_kr[i])
-    # # print(y_kr.shape)
-    # print(dimension,evaluation.eval_test(y_kr))
-    # write_csv(train_price, './result.csv')
-    # # print(data)
-    # plt.plot(data[1])
-    # plt.show()
+    load_all(80)
+    # dimension = 80
+    # train_desc,train_price,test_desc = load_all(dimension)

 if __name__ == '__main__':
     main()
 import numpy as np
+import pandas as pd

 value_map = {}
 value_map["MSSubClass"] = {'180':1,
                            '30':2, '45':2,
@@ -110,10 +112,18 @@ def fix_miss(name):
     else:
         return 0

-# def fix_LotFrontage(Full_map):
-#     a = np.zeros(25)
-#     for i in range(25):
-#         a[Full_map['Neighborhood'][i]-1] +=
+def fix_LotFrontage(Full_map):
+    data_df = pd.DataFrame(Full_map)
+    data_df["LotFrontage"] = data_df.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
+    return data_df["LotFrontage"].to_numpy()
+
+def binary(npdata):
+    for i in range(len(npdata)):
+        if npdata[i]>0:
+            npdata[i] = 1
+        else:
+            npdata[i] = 0
+    return npdata

 def add_future(features):
     features["TotalHouse"] = features["TotalBsmtSF"] + features["1stFlrSF"] + features["2ndFlrSF"]
@@ -141,7 +151,9 @@ def add_future(features):
     features["Rooms"] = features["FullBath"]+features["TotRmsAbvGrd"]
     features["PorchArea"] = features["OpenPorchSF"]+features["EnclosedPorch"]+features["3SsnPorch"]+features["ScreenPorch"]
     features["TotalPlace"] = features["TotalBsmtSF"] + features["1stFlrSF"] + features["2ndFlrSF"] + features["GarageArea"] + features["OpenPorchSF"]+features["EnclosedPorch"]+features["3SsnPorch"]+features["ScreenPorch"]
+    features['all_quality'] = (features['ExterQual'] +features['BsmtFinType1']+features['BsmtFinType2']+
+                               features['KitchenQual']+features['FireplaceQu']+features['GarageQual']+
+                               features['PoolQC']+features['Fence'])
     features['YrBltAndRemod']=features['YearBuilt']+features['YearRemodAdd']
     features['TotalSF']=features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
@@ -153,11 +165,15 @@ def add_future(features):
     features['Total_porch_sf'] = (features['OpenPorchSF'] + features['3SsnPorch'] +
                                   features['EnclosedPorch'] + features['ScreenPorch'] +
                                   features['WoodDeckSF'])
-    # features['haspool'] = features['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
-    # features['has2ndfloor'] = features['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
-    # features['hasgarage'] = features['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
-    # features['hasbsmt'] = features['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
-    # features['hasfireplace'] = features['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
+    #random features
+    random_list = ['GrLivArea','OverallQual','2ndFlrSF','YearBuilt','1stFlrSF','TotalBsmtSF','OverallCond',
+                   'my_Neighborhood','my_SaleCondition','BsmtFinSF1','my_MSZoning','LotArea','GarageCars','YearRemodAdd','GarageArea']
+    length = len(random_list)
+    for i in range(length):
+        for j in range(i,length):
+            if i != j:
+                features[random_list[i]+'*'+random_list[j]]=features[random_list[i]]*features[random_list[j]]
     return features
\ No newline at end of file
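The new fix_LotFrontage leans on the idea that houses in the same neighborhood tend to have similar street frontage, so missing LotFrontage values are filled with the per-neighborhood median. A self-contained sketch of the groupby/transform pattern it uses (toy data, not from the commit):

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "Neighborhood": ["A", "A", "A", "B", "B"],
    "LotFrontage":  [60.0, np.nan, 80.0, 50.0, np.nan],
})
# Within each neighborhood group, replace NaN with that group's median.
df["LotFrontage"] = df.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))
print(df["LotFrontage"].to_numpy())   # [60. 70. 80. 50. 50.]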
@@ -32,7 +32,7 @@ def RMSE(records_real,records_predict):

 def main():
     # my_price = load_submission('./datasets/sample_submission.csv')
-    my_price = load_submission('./result/0.03688_0.14435.csv')
+    my_price = load_submission('./result/keras_untuned.csv')
     print(eval_test(my_price))

 if __name__ == '__main__':
...
-from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
+from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC,LassoCV
 from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
 from sklearn.kernel_ridge import KernelRidge
 from sklearn.pipeline import make_pipeline
@@ -7,29 +7,104 @@ from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
 from sklearn.model_selection import KFold, cross_val_score, train_test_split
 from sklearn.metrics import mean_squared_error
 from sklearn.model_selection import GridSearchCV
-# import xgboost as xgb
-# import lightgbm as lgb
+from sklearn.svm import SVR
+import xgboost as xgb
+import lightgbm as lgb
 import numpy as np
-import torch
 import dataloader
 import evaluation
 import time
 import transformer
+import time
+import warnings
+warnings.filterwarnings('ignore')
+import matplotlib.pyplot as plt
+
+def eval(model,train_x,train_y,test_x):
+    model.fit(train_x, train_y)
+    y_pre = model.predict(test_x)
+    for i in range(len(y_pre)):
+        y_pre[i] = dataloader.convert2price(y_pre[i])
+    return evaluation.eval_test(y_pre),y_pre
+    # print(dimension,evaluation.eval_test(y_pre))

-dimension = 85
-train_desc,train_price,test_desc = dataloader.load_all(dimension)
-kr = GridSearchCV(KernelRidge(kernel='polynomial'),
-                  param_grid={"alpha": np.logspace(-3, 2, 6),
-                              "gamma": np.logspace(-2, 2, 5)})
-# print(np.logspace(-2, 2, 5))
-kr.fit(train_desc, train_price)
-y_kr = kr.predict(test_desc)
-for i in range(len(y_kr)):
-    y_kr[i] = dataloader.convert2price(y_kr[i])
-# print(y_kr.shape)
-print(dimension,evaluation.eval_test(y_kr))
-dataloader.write_csv(y_kr, './result/result.csv')
\ No newline at end of file
+# KernelRidge()
+krr = GridSearchCV(KernelRidge(kernel='polynomial'),cv = 3,
+                   param_grid={"alpha": np.logspace(-1, 2, 10),
+                               "gamma": np.logspace(-1, 2, 10)})
+las = LassoCV(alphas=np.logspace(-5, 2, 50),eps=np.logspace(-5, 2, 20),max_iter=10000)
+model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
+                             learning_rate=0.05, max_depth=3,
+                             min_child_weight=1.7817, n_estimators=2200,
+                             reg_alpha=0.4640, reg_lambda=0.8571,
+                             subsample=0.5213, silent=1,
+                             random_state =7, nthread = -1)
+# ElasticNet
+ENet = GridSearchCV(ElasticNet(max_iter = 10000),
+                    param_grid={"alpha": np.logspace(-3, 2, 6),
+                                "l1_ratio": np.logspace(-2, 2, 5)})
+#BayesianRidge
+bay = BayesianRidge()
+#GradientBoostingRegressor
+GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.01,
+                                   max_depth=4, max_features='sqrt',
+                                   min_samples_leaf=15, min_samples_split=10,
+                                   loss='huber', random_state =5)
+#LGBMRegressor
+model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
+                              learning_rate=0.05, n_estimators=720,
+                              max_bin = 55, bagging_fraction = 0.8,
+                              bagging_freq = 5, feature_fraction = 0.2319,
+                              feature_fraction_seed=9, bagging_seed=9,
+                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
+#SVR
+model_svr = GridSearchCV(SVR(kernel="rbf"),
+                         param_grid={"C": np.logspace(0, 2, 5),
+                                     "gamma": np.logspace(-4, -3, 8),
+                                     "epsilon":np.logspace(-4, -3, 5)})
+
+models = [krr,las,model_xgb,ENet,bay,GBoost,model_lgb,model_svr]
+model_names = ['krr','las','model_xgb','ENet','bay','GBoost','model_lgb','model_svr']
+for model,model_name in zip(models,model_names):
+    print(model_name)
+    losss = []
+    start_dimension = 60
+    end_dimension = 150
+    for dimension in range(start_dimension,end_dimension):
+        t1 = time.time()
+        train_desc,train_price,test_desc = dataloader.load_all(dimension)
+        loss,_ = eval(model,train_desc,train_price,test_desc)
+        losss.append(loss)
+        t2 = time.time()
+        print(dimension,loss,' cost time:','%.3f'%(t2-t1),'s')
+    t1 = time.time()
+    best_dimension = losss.index(min(losss))+start_dimension
+    print('Best:',min(losss),' dimension:',best_dimension)
+    train_desc,train_price,test_desc = dataloader.load_all(best_dimension)
+    loss,pre = eval(model,train_desc,train_price,test_desc)
+    dataloader.write_csv(pre, './result/best_'+'%.6f'%loss+'_'+model_name+'.csv')
+    plt.plot(np.linspace(start_dimension,dimension,dimension-start_dimension+1),losss)
+    plt.xlabel('PCA dimension')
+    plt.ylabel('loss')
+    plt.title(model_name+' :loss_PCA')
+    plt.savefig('./images/'+'%.6f'%loss+'_'+str(best_dimension)+'_'+model_name+".png")
+    plt.cla()
+    # plt.show()
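For reference, a self-contained sketch (not part of the commit) of how a GridSearchCV-wrapped model such as krr above exposes its selected hyperparameters after fitting inside eval(); the data here is random and purely illustrative:

import numpy as np
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV

rng = np.random.RandomState(0)
X, y = rng.rand(50, 5), rng.rand(50)
krr = GridSearchCV(KernelRidge(kernel='polynomial'), cv=3,
                   param_grid={"alpha": np.logspace(-1, 2, 4),
                               "gamma": np.logspace(-1, 2, 4)})
krr.fit(X, y)                 # refits the best estimator on all of X by default
print(krr.best_params_, krr.best_score_)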
@@ -10,7 +10,7 @@ class Linear(nn.Module):
     def __init__(self, n_feature, n_hidden, n_output):
         super(Linear, self).__init__()
         self.fc1 = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
-        self.relu = nn.ReLU(inplace=True)
+        self.relu = nn.Sigmoid()
         self.dropout = nn.Dropout(0.2)
         self.fc2 = torch.nn.Linear(n_hidden, n_output)   # output layer
...
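This hunk swaps the hidden activation from ReLU to Sigmoid while keeping the old attribute name self.relu. Since forward() is not shown in the diff, the following is a hedged sketch of how the module plausibly fits together; the forward ordering is an assumption:

import torch
import torch.nn as nn

class Linear(nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Linear, self).__init__()
        self.fc1 = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
        self.relu = nn.Sigmoid()       # attribute name kept; now a Sigmoid
        self.dropout = nn.Dropout(0.2)
        self.fc2 = torch.nn.Linear(n_hidden, n_output)    # output layer

    def forward(self, x):              # forward() is not in the diff; this
        x = self.relu(self.fc1(x))     # ordering is assumed for illustration
        x = self.dropout(x)
        return self.fc2(x)

A Sigmoid hidden layer squashes activations into (0, 1), which loosely matches the min-max scaled features and targets this commit standardizes on.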
This diff is collapsed. (shown for 9 more files)
@@ -8,13 +8,13 @@ import time
 import transformer

 #parameter
-LR = 0.0001
+LR = 0.001
 EPOCHS = 1000
 BATCHSIZE = 64
 CONTINUE = False
 use_gpu = True
 SAVE_FRE = 5
-Dimension = 120
+Dimension = 128
 #load data
 train_desc,train_price,test_desc = dataloader.load_all(Dimension)
 train_desc.tolist()
...
@@ -8,4 +8,17 @@ def match_random(a,b):
     np.random.shuffle(b)

 def random_transform(a,alpha):
-    return a*random.uniform(1-alpha,1+alpha)
\ No newline at end of file
+    return a*random.uniform(1-alpha,1+alpha)
+
+def normlize(npdata,justprice = False):
+    _min = np.min(npdata)
+    _max = np.max(npdata)
+    if justprice:
+        _min = 34900.0
+        _max = 755000.0
+    return (npdata-_min)/(_max-_min)
+
+def convert2price(tensor):
+    return tensor*(755000.0-34900.0)+34900
\ No newline at end of file
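One hedged caveat about this normlize (an editorial observation, not something the commit addresses): min-max scaling divides by (_max - _min), so a constant feature column produces 0/0 and fills the column with NaN:

import numpy as np

col = np.array([5.0, 5.0, 5.0])
# (col - 5.0) / 0.0 -> RuntimeWarning and an all-NaN result
print((col - np.min(col)) / (np.max(col) - np.min(col)))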