提交 f74d3a2d 编写于 作者: J jrzaurin

Temporary files used to check that the modules run

上级 f349b6d5
...@@ -3,14 +3,11 @@ import pandas as pd ...@@ -3,14 +3,11 @@ import pandas as pd
import torch import torch
from pathlib import Path from pathlib import Path
# from pytorch_widedeep.utils.data_utils import prepare_data from pytorch_widedeep.utils.wide_utils import WideProcessor
# from pytorch_widedeep.models.wide_deep import WideDeep from pytorch_widedeep.utils.deep_utils import DeepProcessor
# from pytorch_widedeep.initializers import Normal, Uniform, XavierNormal, XavierUniform from pytorch_widedeep.models.wide import Wide
# from pytorch_widedeep.lr_schedulers import MultipleLRScheduler, StepLR, MultiStepLR, ReduceLROnPlateau from pytorch_widedeep.models.deep_dense import DeepDense
# from pytorch_widedeep.optimizers import Adam, SGD, RAdam
# from pytorch_widedeep.callbacks import EarlyStopping, ModelCheckpoint
# from pytorch_widedeep.metrics import BinaryAccuracy
# use_cuda = torch.cuda.is_available() # use_cuda = torch.cuda.is_available()
...@@ -26,21 +23,34 @@ if __name__ == '__main__': ...@@ -26,21 +23,34 @@ if __name__ == '__main__':
df.drop('income', axis=1, inplace=True) df.drop('income', axis=1, inplace=True)
df.head() df.head()
from pytorch_widedeep.utils.wide_utils import WideProcessor
wide_cols = ['age_buckets', 'education', 'relationship','workclass','occupation', wide_cols = ['age_buckets', 'education', 'relationship','workclass','occupation',
'native_country','gender'] 'native_country','gender']
crossed_cols = [('education', 'occupation'), ('native_country', 'occupation')] crossed_cols = [('education', 'occupation'), ('native_country', 'occupation')]
prepare_wide = WideProcessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
X_wide = prepare_wide.fit_transform(df)
from pytorch_widedeep.utils.deep_utils import DeepProcessor
cat_embed_cols = [('education',10), ('relationship',8), ('workclass',10), cat_embed_cols = [('education',10), ('relationship',8), ('workclass',10),
('occupation',10),('native_country',10)] ('occupation',10),('native_country',10)]
continuous_cols = ["age","hours_per_week"] continuous_cols = ["age","hours_per_week"]
prepare_wide = WideProcessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
X_wide = prepare_wide.fit_transform(df)
prepare_deep = DeepProcessor(embed_cols=cat_embed_cols, continuous_cols=continuous_cols) prepare_deep = DeepProcessor(embed_cols=cat_embed_cols, continuous_cols=continuous_cols)
X_deep = prepare_deep.fit_transform(df) X_deep = prepare_deep.fit_transform(df)
wide = Wide(X_wide.shape[1], 1)
pred_wide = wide(torch.tensor(X_wide[:10]))
deep = DeepDense(
hidden_layers=[32,16],
dropout=[0.5],
deep_column_idx=prepare_deep.deep_column_idx,
embed_input=prepare_deep.embeddings_input,
continuous_cols=continuous_cols,
batchnorm=True,
output_dim=1)
pred_deep = deep(torch.tensor(X_deep[:10]))
pdb.set_trace()
# wd_dataset = prepare_data(df, # wd_dataset = prepare_data(df,
# target=target, # target=target,
# wide_cols=wide_cols, # wide_cols=wide_cols,
......
...@@ -4,88 +4,73 @@ import pickle ...@@ -4,88 +4,73 @@ import pickle
import numpy as np import numpy as np
import pandas as pd import pandas as pd
# from torchvision.transforms import ToTensor, Normalize from pytorch_widedeep.utils.text_utils import TextProcessor
# from pytorch_widedeep.initializers import Normal, Uniform, XavierNormal, XavierUniform from pytorch_widedeep.models.deep_text import DeepText
# from pytorch_widedeep.lr_schedulers import MultipleLRScheduler, StepLR, MultiStepLR
# from pytorch_widedeep.optimizers import MultipleOptimizers, Adam, SGD, RAdam
# from pytorch_widedeep.callbacks import EarlyStopping, ModelCheckpoint
# from pytorch_widedeep.metrics import BinaryAccuracy
# from pytorch_widedeep.utils.data_utils import prepare_data
# from pytorch_widedeep.models.wide_deep import WideDeep, WideDeepLoader
import pdb from pytorch_widedeep.utils.image_utils import ImageProcessor
from pytorch_widedeep.models.deep_image import DeepImage
from pytorch_widedeep.utils.base_util import DataProcessor import pdb
use_cuda = torch.cuda.is_available() use_cuda = torch.cuda.is_available()
if __name__ == '__main__': if __name__ == '__main__':
filepath = 'data/wd_dataset_airbnb.p'
# if os.path.isfile(filepath):
# wd = pickle.load(open(filepath, "rb"))
# else:
# df = pd.read_csv('../data/airbnb/tmp_df.csv')
# crossed_cols = (['property_type', 'room_type'],) df = pd.read_csv('../data/airbnb/tmp_df.csv')
# already_dummies = [c for c in df.columns if 'amenity' in c] + ['has_house_rules']
# wide_cols = ['is_location_exact', 'property_type', 'room_type', 'host_gender',
# 'instant_bookable'] + already_dummies
# cat_embed_cols = [(c, 16) for c in df.columns if 'catg' in c] + \
# [('neighbourhood_cleansed', 64), ('cancellation_policy', 16)]
# continuous_cols = ['latitude', 'longitude', 'security_deposit', 'extra_people']
# already_standard = ['latitude', 'longitude']
# text_col = 'description'
# word_vectors_path = 'data/glove.6B/glove.6B.100d.txt'
# img_col = 'id'
# img_path = 'data/airbnb/property_picture'
# target = 'yield'
# wd = prepare_data(df, target, wide_cols, crossed_cols, cat_embed_cols,
# continuous_cols, already_dummies, already_standard, text_col=text_col,
# word_vectors_path=word_vectors_path, img_col=img_col,
# img_path=img_path, filepath='data/wd_dataset_airbnb.p')
# from pytorch_widedeep.utils.text_utils import TextProcessor
# df = pd.read_csv('../data/airbnb/tmp_df.csv')
# text_col = 'description'
# word_vectors_path = '../data/glove.6B/glove.6B.100d.txt'
# text_processor = TextProcessor(word_vectors_path=word_vectors_path)
# X_text = text_processor.fit_transform(df, text_col)
# new_X = text_processor.transform(df.iloc[:10, :], text_col)
# pdb.set_trace()
from pytorch_widedeep.utils.image_utils import ImageProcessor crossed_cols = (['property_type', 'room_type'],)
already_dummies = [c for c in df.columns if 'amenity' in c] + ['has_house_rules']
wide_cols = ['is_location_exact', 'property_type', 'room_type', 'host_gender',
'instant_bookable'] + already_dummies
cat_embed_cols = [(c, 16) for c in df.columns if 'catg' in c] + \
[('neighbourhood_cleansed', 64), ('cancellation_policy', 16)]
continuous_cols = ['latitude', 'longitude', 'security_deposit', 'extra_people']
already_standard = ['latitude', 'longitude']
text_col = 'description'
word_vectors_path = '../data/glove.6B/glove.6B.100d.txt'
img_col = 'id' img_col = 'id'
img_path = '../data/airbnb/property_picture' img_path = '../data/airbnb/property_picture'
df = pd.read_csv('../data/airbnb/tmp_df.csv') target = 'yield'
text_processor = TextProcessor(word_vectors_path=word_vectors_path)
X_text = text_processor.fit_transform(df, text_col)
deeptext = DeepText(
vocab_size=len(text_processor.vocab.itos),
hidden_dim=64,
n_layers=3,
rnn_dropout=0.5,
spatial_dropout=0.5,
padding_idx=1,
output_dim=1,
embedding_matrix=text_processor.embedding_matrix
)
image_processor = ImageProcessor() image_processor = ImageProcessor()
X_images = image_processor.fit_transform(df, img_col, img_path) X_images = image_processor.fit_transform(df, img_col, img_path)
new_X = image_processor.transform(df.iloc[:10,:], img_col, img_path) deepimage = DeepImage()
pdb.set_trace()
# model = WideDeep(output_dim=1, wide_dim=wd.wide.shape[1],
model = WideDeep(output_dim=1, wide_dim=wd.wide.shape[1], # cat_embed_input = wd.cat_embed_input,
cat_embed_input = wd.cat_embed_input, # cat_embed_encoding_dict=wd.cat_embed_encoding_dict,
cat_embed_encoding_dict=wd.cat_embed_encoding_dict, # continuous_cols=wd.continuous_cols,
continuous_cols=wd.continuous_cols, # deep_column_idx=wd.deep_column_idx, add_text=True,
deep_column_idx=wd.deep_column_idx, add_text=True, # vocab_size=len(wd.vocab.itos),
vocab_size=len(wd.vocab.itos), # word_embed_matrix = wd.word_embed_matrix,
word_embed_matrix = wd.word_embed_matrix, # add_image=True)
add_image=True)
# initializers = {'wide': Normal, 'deepdense':Normal, 'deeptext':Normal, 'deepimage':Normal}
initializers = {'wide': Normal, 'deepdense':Normal, 'deeptext':Normal, 'deepimage':Normal} # optimizers = {'wide': Adam, 'deepdense':Adam, 'deeptext':RAdam, 'deepimage':Adam}
optimizers = {'wide': Adam, 'deepdense':Adam, 'deeptext':RAdam, 'deepimage':Adam} # schedulers = {'wide': StepLR(step_size=5), 'deepdense':StepLR(step_size=5), 'deeptext':MultiStepLR(milestones=[5,8]),
schedulers = {'wide': StepLR(step_size=5), 'deepdense':StepLR(step_size=5), 'deeptext':MultiStepLR(milestones=[5,8]), # 'deepimage':MultiStepLR(milestones=[5,8])}
'deepimage':MultiStepLR(milestones=[5,8])} # mean = [0.406, 0.456, 0.485] #BGR
mean = [0.406, 0.456, 0.485] #BGR # std = [0.225, 0.224, 0.229] #BGR
std = [0.225, 0.224, 0.229] #BGR # transforms = [ToTensor, Normalize(mean=mean, std=std)]
transforms = [ToTensor, Normalize(mean=mean, std=std)] # callbacks = [EarlyStopping, ModelCheckpoint(filepath='model_weights/wd_out.pt')]
callbacks = [EarlyStopping, ModelCheckpoint(filepath='model_weights/wd_out.pt')]
# model.compile(method='regression', initializers=initializers, optimizers=optimizers,
model.compile(method='regression', initializers=initializers, optimizers=optimizers, # lr_schedulers=schedulers, callbacks=callbacks, transforms=transforms)
lr_schedulers=schedulers, callbacks=callbacks, transforms=transforms)
# model.fit(X_wide=wd.wide, X_deep=wd.deepdense, X_text=wd.deeptext, X_img=wd.deepimage,
model.fit(X_wide=wd.wide, X_deep=wd.deepdense, X_text=wd.deeptext, X_img=wd.deepimage, # target=wd.target, n_epochs=1, batch_size=32, val_split=0.2)
target=wd.target, n_epochs=1, batch_size=32, val_split=0.2) \ No newline at end of file
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册