提交 f74d3a2d 编写于 作者: J jrzaurin

Temporary files used to check that the modules run

上级 f349b6d5
......@@ -3,14 +3,11 @@ import pandas as pd
import torch
from pathlib import Path
# from pytorch_widedeep.utils.data_utils import prepare_data
# from pytorch_widedeep.models.wide_deep import WideDeep
from pytorch_widedeep.utils.wide_utils import WideProcessor
from pytorch_widedeep.utils.deep_utils import DeepProcessor
# from pytorch_widedeep.initializers import Normal, Uniform, XavierNormal, XavierUniform
# from pytorch_widedeep.lr_schedulers import MultipleLRScheduler, StepLR, MultiStepLR, ReduceLROnPlateau
# from pytorch_widedeep.optimizers import Adam, SGD, RAdam
# from pytorch_widedeep.callbacks import EarlyStopping, ModelCheckpoint
# from pytorch_widedeep.metrics import BinaryAccuracy
from pytorch_widedeep.models.wide import Wide
from pytorch_widedeep.models.deep_dense import DeepDense
# use_cuda = torch.cuda.is_available()
......@@ -26,21 +23,34 @@ if __name__ == '__main__':
# NOTE(review): diff hunk from a scratch smoke-test script; these lines live inside
# `if __name__ == '__main__':` (see hunk header) and the diff rendering has stripped
# the original indentation. Code is left byte-identical; comments only.
# Drop the target column so it is not fed back in as a feature.
df.drop('income', axis=1, inplace=True)
# `df.head()` result is discarded — leftover from interactive exploration.
df.head()
# Redundant: WideProcessor is already imported at the top of the file.
from pytorch_widedeep.utils.wide_utils import WideProcessor
wide_cols = ['age_buckets', 'education', 'relationship','workclass','occupation',
'native_country','gender']
crossed_cols = [('education', 'occupation'), ('native_country', 'occupation')]
# One-hot / crossed-column encoding for the wide (linear) component.
prepare_wide = WideProcessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
X_wide = prepare_wide.fit_transform(df)
# Redundant: DeepProcessor is already imported at the top of the file.
from pytorch_widedeep.utils.deep_utils import DeepProcessor
# (column_name, embedding_dim) pairs for the categorical embeddings.
cat_embed_cols = [('education',10), ('relationship',8), ('workclass',10),
('occupation',10),('native_country',10)]
continuous_cols = ["age","hours_per_week"]
# NOTE(review): exact duplicate of the WideProcessor fit/transform a few lines
# above — X_wide is recomputed identically; one of the two should go.
prepare_wide = WideProcessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
X_wide = prepare_wide.fit_transform(df)
prepare_deep = DeepProcessor(embed_cols=cat_embed_cols, continuous_cols=continuous_cols)
X_deep = prepare_deep.fit_transform(df)
# Smoke-test the Wide linear model on the first 10 rows.
wide = Wide(X_wide.shape[1], 1)
pred_wide = wide(torch.tensor(X_wide[:10]))
# Smoke-test the DeepDense MLP with the indices/embedding inputs produced by
# the DeepProcessor above.
deep = DeepDense(
hidden_layers=[32,16],
dropout=[0.5],
deep_column_idx=prepare_deep.deep_column_idx,
embed_input=prepare_deep.embeddings_input,
continuous_cols=continuous_cols,
batchnorm=True,
output_dim=1)
pred_deep = deep(torch.tensor(X_deep[:10]))
# NOTE(review): interactive debugger breakpoint left in — fine for a scratch
# file, but must not reach mainline.
pdb.set_trace()
# wd_dataset = prepare_data(df,
# target=target,
# wide_cols=wide_cols,
......
......@@ -4,88 +4,73 @@ import pickle
import numpy as np
import pandas as pd
# from torchvision.transforms import ToTensor, Normalize
# from pytorch_widedeep.initializers import Normal, Uniform, XavierNormal, XavierUniform
# from pytorch_widedeep.lr_schedulers import MultipleLRScheduler, StepLR, MultiStepLR
# from pytorch_widedeep.optimizers import MultipleOptimizers, Adam, SGD, RAdam
# from pytorch_widedeep.callbacks import EarlyStopping, ModelCheckpoint
# from pytorch_widedeep.metrics import BinaryAccuracy
# from pytorch_widedeep.utils.data_utils import prepare_data
# from pytorch_widedeep.models.wide_deep import WideDeep, WideDeepLoader
from pytorch_widedeep.utils.text_utils import TextProcessor
from pytorch_widedeep.models.deep_text import DeepText
import pdb
from pytorch_widedeep.utils.image_utils import ImageProcessor
from pytorch_widedeep.models.deep_image import DeepImage
from pytorch_widedeep.utils.base_util import DataProcessor
import pdb
# NOTE(review): diff hunk of a scratch smoke-test script; the diff rendering has
# stripped the indentation of the `if __name__` body. Code left byte-identical.
# `use_cuda` is computed but never read in this script.
use_cuda = torch.cuda.is_available()
if __name__ == '__main__':
# `filepath` is assigned but unused now that the pickle-load path is commented out.
filepath = 'data/wd_dataset_airbnb.p'
# if os.path.isfile(filepath):
# wd = pickle.load(open(filepath, "rb"))
# else:
# df = pd.read_csv('../data/airbnb/tmp_df.csv')
# crossed_cols = (['property_type', 'room_type'],)
# already_dummies = [c for c in df.columns if 'amenity' in c] + ['has_house_rules']
# wide_cols = ['is_location_exact', 'property_type', 'room_type', 'host_gender',
# 'instant_bookable'] + already_dummies
# cat_embed_cols = [(c, 16) for c in df.columns if 'catg' in c] + \
# [('neighbourhood_cleansed', 64), ('cancellation_policy', 16)]
# continuous_cols = ['latitude', 'longitude', 'security_deposit', 'extra_people']
# already_standard = ['latitude', 'longitude']
# text_col = 'description'
# word_vectors_path = 'data/glove.6B/glove.6B.100d.txt'
# img_col = 'id'
# img_path = 'data/airbnb/property_picture'
# target = 'yield'
# wd = prepare_data(df, target, wide_cols, crossed_cols, cat_embed_cols,
# continuous_cols, already_dummies, already_standard, text_col=text_col,
# word_vectors_path=word_vectors_path, img_col=img_col,
# img_path=img_path, filepath='data/wd_dataset_airbnb.p')
# from pytorch_widedeep.utils.text_utils import TextProcessor
# df = pd.read_csv('../data/airbnb/tmp_df.csv')
# text_col = 'description'
# word_vectors_path = '../data/glove.6B/glove.6B.100d.txt'
# text_processor = TextProcessor(word_vectors_path=word_vectors_path)
# X_text = text_processor.fit_transform(df, text_col)
# new_X = text_processor.transform(df.iloc[:10, :], text_col)
# pdb.set_trace()
# Load the Airbnb sample; NOTE(review): duplicated by an identical read_csv below.
df = pd.read_csv('../data/airbnb/tmp_df.csv')
# Redundant: ImageProcessor is already imported at the top of the file.
from pytorch_widedeep.utils.image_utils import ImageProcessor
crossed_cols = (['property_type', 'room_type'],)
already_dummies = [c for c in df.columns if 'amenity' in c] + ['has_house_rules']
wide_cols = ['is_location_exact', 'property_type', 'room_type', 'host_gender',
'instant_bookable'] + already_dummies
# (column_name, embedding_dim) pairs; 'catg' columns get dim 16 by default.
cat_embed_cols = [(c, 16) for c in df.columns if 'catg' in c] + \
[('neighbourhood_cleansed', 64), ('cancellation_policy', 16)]
continuous_cols = ['latitude', 'longitude', 'security_deposit', 'extra_people']
# `already_standard`, `wide_cols`, `crossed_cols` etc. are set up but unused
# below — only the text and image pipelines are exercised in this revision.
already_standard = ['latitude', 'longitude']
text_col = 'description'
word_vectors_path = '../data/glove.6B/glove.6B.100d.txt'
img_col = 'id'
img_path = '../data/airbnb/property_picture'
# NOTE(review): duplicate of the read_csv above — same file, df is reloaded.
df = pd.read_csv('../data/airbnb/tmp_df.csv')
target = 'yield'
# Tokenize descriptions and build the pretrained-embedding matrix from GloVe.
text_processor = TextProcessor(word_vectors_path=word_vectors_path)
X_text = text_processor.fit_transform(df, text_col)
# Smoke-test: instantiate the text model with the processor's vocab/embeddings.
deeptext = DeepText(
vocab_size=len(text_processor.vocab.itos),
hidden_dim=64,
n_layers=3,
rnn_dropout=0.5,
spatial_dropout=0.5,
padding_idx=1,
output_dim=1,
embedding_matrix=text_processor.embedding_matrix
)
# Smoke-test the image pipeline: fit on the full frame, then transform a slice
# to check that a fitted processor can be reused on new rows.
image_processor = ImageProcessor()
X_images = image_processor.fit_transform(df, img_col, img_path)
new_X = image_processor.transform(df.iloc[:10,:], img_col, img_path)
# NOTE(review): interactive debugger breakpoint left in — everything after it
# only runs once the developer continues; must not reach mainline.
pdb.set_trace()
# Smoke-test: default-constructed image model.
deepimage = DeepImage()
# model = WideDeep(output_dim=1, wide_dim=wd.wide.shape[1],
# cat_embed_input = wd.cat_embed_input,
# cat_embed_encoding_dict=wd.cat_embed_encoding_dict,
# continuous_cols=wd.continuous_cols,
# deep_column_idx=wd.deep_column_idx, add_text=True,
# vocab_size=len(wd.vocab.itos),
# word_embed_matrix = wd.word_embed_matrix,
# add_image=True)
# initializers = {'wide': Normal, 'deepdense':Normal, 'deeptext':Normal, 'deepimage':Normal}
# optimizers = {'wide': Adam, 'deepdense':Adam, 'deeptext':RAdam, 'deepimage':Adam}
# schedulers = {'wide': StepLR(step_size=5), 'deepdense':StepLR(step_size=5), 'deeptext':MultiStepLR(milestones=[5,8]),
# 'deepimage':MultiStepLR(milestones=[5,8])}
# mean = [0.406, 0.456, 0.485] #BGR
# std = [0.225, 0.224, 0.229] #BGR
# transforms = [ToTensor, Normalize(mean=mean, std=std)]
# callbacks = [EarlyStopping, ModelCheckpoint(filepath='model_weights/wd_out.pt')]
# model.compile(method='regression', initializers=initializers, optimizers=optimizers,
# lr_schedulers=schedulers, callbacks=callbacks, transforms=transforms)
# model.fit(X_wide=wd.wide, X_deep=wd.deepdense, X_text=wd.deeptext, X_img=wd.deepimage,
# target=wd.target, n_epochs=1, batch_size=32, val_split=0.2)
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册