From 04d40076741a9a9dfe2a4967bba28fa0dd6a878e Mon Sep 17 00:00:00 2001 From: Pavol Mulinka Date: Sun, 17 Oct 2021 19:09:20 +0200 Subject: [PATCH] =?UTF-8?q?code=20cleaned=20+=20added=20workaround=20for?= =?UTF-8?q?=20=C2=A8ImportError:=20libGL.so.1:=20cannot=20open=20shared=20?= =?UTF-8?q?object=20file:=20No=20such=20file=20or=20directory=20```=20apt-?= =?UTF-8?q?get=20update=20apt-get=20install=20ffmpeg=20libsm6=20libxext6?= =?UTF-8?q?=20=20-y=20```=C2=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/build.yml | 2 + .../12_HyperParameter_tuning_w_RayTune.ipynb | 98 +++++++------------ pytorch_widedeep/callbacks.py | 49 ++-------- pytorch_widedeep/tab2vec.py | 48 ++++++--- pytorch_widedeep/training/trainer.py | 2 +- setup.py | 2 +- 6 files changed, 83 insertions(+), 118 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 480655f..5770bfc 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -44,6 +44,8 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | + apt-get update + apt-get install ffmpeg libsm6 libxext6 -y python -m pip install --upgrade pip python -m pip install pytest-cov codecov . if [ -f requirements.txt ]; then pip install -r requirements.txt; fi diff --git a/examples/12_HyperParameter_tuning_w_RayTune.ipynb b/examples/12_HyperParameter_tuning_w_RayTune.ipynb index 5db289f..d638b72 100644 --- a/examples/12_HyperParameter_tuning_w_RayTune.ipynb +++ b/examples/12_HyperParameter_tuning_w_RayTune.ipynb @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -72,7 +72,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -631,7 +631,7 @@ "4 0.68 -0.59 2.0 -36.0 -6.9 2.02 0.14 -0.23 " ] }, - "execution_count": 6, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -644,7 +644,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -655,7 +655,7 @@ "Name: target, dtype: int64" ] }, - "execution_count": 7, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -667,7 +667,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -677,7 +677,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -694,7 +694,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -703,7 +703,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -729,7 +729,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -740,7 +740,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -787,7 +787,7 @@ ")" ] }, - "execution_count": 13, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -802,7 +802,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -815,7 +815,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -828,19 +828,19 @@ }, 
{ "cell_type": "code", - "execution_count": 18, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "== Status ==
Memory usage on this node: 2.1/12.2 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/6.69 GiB heap, 0.0/2.29 GiB objects
Result logdir: /home/palo/ray_results/_inner_2021-10-15_01-12-33
Number of trials: 2/2 (2 TERMINATED)
\n", + "== Status ==
Memory usage on this node: 3.8/12.2 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/5.86 GiB heap, 0.0/2.0 GiB objects
Result logdir: /home/palo/ray_results/_inner_2021-10-17_19-06-33
Number of trials: 2/2 (2 TERMINATED)
\n", "\n", "\n", "\n", "\n", - "\n", - "\n", + "\n", + "\n", "\n", "
Trial name status loc batch_size iter total time (s)
_inner_367dc_00000TERMINATED 1000 5 14.1115
_inner_367dc_00001TERMINATED 5000 5 13.1224
_inner_94413_00000TERMINATED 1000 5 16.337
_inner_94413_00001TERMINATED 5000 5 15.021


" ], @@ -855,7 +855,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-10-15 01:12:50,327\tINFO tune.py:448 -- Total run time: 16.66 seconds (16.61 seconds for the tuning loop).\n" + "2021-10-17 19:06:52,530\tINFO tune.py:448 -- Total run time: 21.26 seconds (19.21 seconds for the tuning loop).\n" ] } ], @@ -882,7 +882,7 @@ " initializers={'deeptabular': XavierNormal},\n", " optimizers={'deeptabular': deep_opt},\n", " metrics=[accuracy, precision, recall, f1],\n", - " verbose=0)\n", + " verbose=1)\n", "\n", " trainer.fit(X_train=X_train,\n", " X_val=X_val,\n", @@ -900,7 +900,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -909,60 +909,38 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(pid=14987)\u001b[0m sys:1: ResourceWarning: unclosed file <_io.TextIOWrapper name='/tmp/ray/session_2021-10-17_19-06-31_273508_14906/logs/worker-edb8ddf5edcb134690e06916577fafd5d2bb26af-01000000-14987.out' mode='a' encoding='utf-8'>\n", + "\u001b[2m\u001b[36m(pid=14987)\u001b[0m sys:1: ResourceWarning: unclosed file <_io.TextIOWrapper name='/tmp/ray/session_2021-10-17_19-06-31_273508_14906/logs/worker-edb8ddf5edcb134690e06916577fafd5d2bb26af-01000000-14987.err' mode='a' encoding='utf-8'>\n" + ] + } + ], "source": [ "%load_ext tensorboard" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 16, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Reusing TensorBoard on port 6006 (pid 2541), started 7:22:22 ago. (Use '!kill 2541' to kill it.)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "%tensorboard --logdir ~/ray_results" ] } ], "metadata": { + "interpreter": { + "hash": "3b99005fd577fa40f3cce433b2b92303885900e634b2b5344c07c59d06c8792d" + }, "kernelspec": { - "display_name": "Python 3", - "language": "python", + "display_name": "Python 3.8.5 64-bit ('base': conda)", "name": "python3" }, "language_info": { diff --git a/pytorch_widedeep/callbacks.py b/pytorch_widedeep/callbacks.py index 4932e47..0e43aa7 100644 --- a/pytorch_widedeep/callbacks.py +++ b/pytorch_widedeep/callbacks.py @@ -150,6 +150,7 @@ class History(Callback): This callback runs by default within :obj:`Trainer`, therefore, should not be passed to the :obj:`Trainer`. Is included here just for completion. 
""" + def on_train_begin(self, logs: Optional[Dict] = None): self.trainer.history = {} @@ -158,11 +159,11 @@ class History(Callback): ): logs = logs or {} for k, v in logs.items(): - if isinstance(v, np.ndarray):# or isinstance(v, list): + if isinstance(v, np.ndarray): v = v.tolist() - if isinstance(v, list) and len(v)>1: + if isinstance(v, list) and len(v) > 1: for i in range(len(v)): - self.trainer.history.setdefault(k+'_'+str(i), []).append(v[i]) + self.trainer.history.setdefault(k + "_" + str(i), []).append(v[i]) else: self.trainer.history.setdefault(k, []).append(v) @@ -264,10 +265,9 @@ class LRHistory(Callback): >>> trainer = Trainer(model, objective="regression", callbacks=[LRHistory(n_epochs=10)]) """ - def __init__(self, n_epochs: int, ray_tune: bool = False): + def __init__(self, n_epochs: int): super(LRHistory, self).__init__() self.n_epochs = n_epochs - self.ray_tune = ray_tune def on_epoch_begin(self, epoch: int, logs: Optional[Dict] = None): if epoch == 0 and self.trainer.lr_scheduler is not None: @@ -671,48 +671,15 @@ class EarlyStopping(Callback): class RayTuneReporter(Callback): r"""Callback that allows reporting history and lr_history values to RayTune for Hyperparameter tuning - - Parameters - ----------- - - Attributes - ---------- - - Examples - -------- - >>> from pytorch_widedeep.callbacks import RayTuneReporter - >>> from pytorch_widedeep.models import TabMlp, Wide, WideDeep - >>> from pytorch_widedeep.training import Trainer - >>> from ray import tune - >>> import tracemalloc - >>> tracemalloc.start() - >>> - >>> config={"batch_size": tune.grid_search([1000, 5000]),} - >>> embed_input = [(u, i, j) for u, i, j in zip(["a", "b", "c"][:4], [4] * 3, [8] * 3)] - >>> column_idx = {k: v for v, k in enumerate(["a", "b", "c"])} - >>> wide = Wide(10, 1) - >>> deep = TabMlp(mlp_hidden_dims=[8, 4], column_idx=column_idx, embed_input=embed_input) - >>> model = WideDeep(wide, deep) - >>> - >>> def training_function(config, X_train, X_val): - >>> batch_size = config["batch_size"] - >>> trainer = Trainer(model, objective="regression", callbacks=[RayTuneReporter]) - >>> trainer.fit(X_train=X_train, - >>> X_val=X_val, - >>> n_epochs=5, - >>> batch_size=batch_size) - >>> X_train = {"X_wide": X_wide_train, "X_tab": X_tab_train, "target": y_train} - >>> X_val = {"X_wide": X_wide_valid, "X_tab": X_tab_valid, "target": y_valid} - >>> analysis = tune.run(tune.with_parameters(training_function, X_train=X_train, X_val=X_val), - >>> config=config) """ + def on_epoch_end( self, epoch: int, logs: Optional[Dict] = None, metric: Optional[float] = None ): report_dict = {} for k, v in self.trainer.history.items(): report_dict.update({k: v[-1]}) - if hasattr(self.trainer, 'lr_history'): + if hasattr(self.trainer, "lr_history"): for k, v in self.trainer.lr_history.items(): report_dict.update({k: v[-1]}) - tune.report(report_dict) \ No newline at end of file + tune.report(report_dict) diff --git a/pytorch_widedeep/tab2vec.py b/pytorch_widedeep/tab2vec.py index 26da4de..8b7b53b 100644 --- a/pytorch_widedeep/tab2vec.py +++ b/pytorch_widedeep/tab2vec.py @@ -126,8 +126,12 @@ class Tab2Vec: """ return self - def transform(self, df: pd.DataFrame, new_embed_col_list: bool = False, - target_col: Optional[str] = None) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.Series]]: + def transform( + self, + df: pd.DataFrame, + new_embed_col_list: bool = False, + target_col: Optional[str] = None, + ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.Series]]: r""" Parameters ---------- @@ -160,22 +164,36 @@ class Tab2Vec: 
col_names = list(self.tab_preprocessor.column_idx.keys()) embed_col_names = [] - for col, vec_size in tab_preprocessor.embed_cols: - embed_col_names_temp = [col+'_'+str(i) for i in range(vec_size)] - embed_col_names.extend(embed_col_names_temp) - col_names = list(chain.from_iterable(embed_col_names_temp if item == col - else [item] for item in col_names)) - - if target_col: - if new_embed_col_list: - return pd.DataFrame(data=X_vec, columns=col_names), df[target_col], embed_col_names + if self.tab_preprocessor.for_transformer: + if target_col: + return pd.DataFrame(data=X_vec), df[target_col] else: - return pd.DataFrame(data=X_vec, columns=col_names), df[target_col] + return pd.DataFrame(data=X_vec) else: - if new_embed_col_list: - return pd.DataFrame(data=X_vec, columns=col_names), embed_col_names + for col, vec_size in self.tab_preprocessor.embed_cols: + embed_col_names_temp = [col + "_" + str(i) for i in range(vec_size)] + embed_col_names.extend(embed_col_names_temp) + col_names = list( + chain.from_iterable( + embed_col_names_temp if item == col else [item] + for item in col_names + ) + ) + + if target_col: + if new_embed_col_list: + return ( + pd.DataFrame(data=X_vec, columns=col_names), + df[target_col], + embed_col_names, + ) + else: + return pd.DataFrame(data=X_vec, columns=col_names), df[target_col] else: - return pd.DataFrame(data=X_vec, columns=col_names) + if new_embed_col_list: + return pd.DataFrame(data=X_vec, columns=col_names), embed_col_names + else: + return pd.DataFrame(data=X_vec, columns=col_names) def fit_transform( self, df: pd.DataFrame, target_col: Optional[str] = None diff --git a/pytorch_widedeep/training/trainer.py b/pytorch_widedeep/training/trainer.py index 59b3d1c..ea38d76 100644 --- a/pytorch_widedeep/training/trainer.py +++ b/pytorch_widedeep/training/trainer.py @@ -1242,7 +1242,7 @@ class Trainer: else: self.metric = None sorted_callbacks_list = deepcopy(self.callbacks) - for obj,i in zip(self.callbacks[::-1], range(len(self.callbacks))[::-1]): + for obj, i in zip(self.callbacks[::-1], range(len(self.callbacks))[::-1]): if isinstance(obj, RayTuneReporter): sorted_callbacks_list.append(sorted_callbacks_list.pop(i)) self.callbacks = sorted_callbacks_list diff --git a/setup.py b/setup.py index 0df8b7d..2f72687 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,7 @@ setup_kwargs = { "einops", "wrapt", "torchmetrics", - "ray", + "ray[tune]", ], "extras_require": extras, "python_requires": ">=3.7.0", -- GitLab
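A note on the `History.on_epoch_end` hunk above: array- or list-valued log entries (for instance per-class metrics) are now flattened into one indexed history key per element. A minimal standalone sketch of that flattening, using hypothetical metric values:

```python
import numpy as np

# same logic as the patched History.on_epoch_end, outside the Trainer
history = {}
logs = {"train_loss": 0.35, "train_precision": np.array([0.91, 0.87])}
for k, v in logs.items():
    if isinstance(v, np.ndarray):
        v = v.tolist()
    if isinstance(v, list) and len(v) > 1:
        # multi-element metrics become one history key per element
        for i in range(len(v)):
            history.setdefault(k + "_" + str(i), []).append(v[i])
    else:
        history.setdefault(k, []).append(v)

print(history)
# {'train_loss': [0.35], 'train_precision_0': [0.91], 'train_precision_1': [0.87]}
```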
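The `RayTuneReporter` callback is wired into Ray Tune by passing it to `Trainer` inside a trainable wrapped with `tune.with_parameters`, as in the docstring example this patch removes. Below is a reconstruction of that example; `X_wide_train`, `X_tab_train`, `y_train` and their validation counterparts are placeholders for pre-processed arrays, not definitions from this patch:

```python
from ray import tune

from pytorch_widedeep.callbacks import RayTuneReporter
from pytorch_widedeep.models import TabMlp, Wide, WideDeep
from pytorch_widedeep.training import Trainer

config = {"batch_size": tune.grid_search([1000, 5000])}
embed_input = [(u, i, j) for u, i, j in zip(["a", "b", "c"], [4] * 3, [8] * 3)]
column_idx = {k: v for v, k in enumerate(["a", "b", "c"])}
wide = Wide(10, 1)
deep = TabMlp(mlp_hidden_dims=[8, 4], column_idx=column_idx, embed_input=embed_input)
model = WideDeep(wide, deep)

def training_function(config, X_train, X_val):
    # RayTuneReporter pushes the last value of every history (and, if present,
    # lr_history) entry to tune.report() at the end of each epoch
    trainer = Trainer(model, objective="regression", callbacks=[RayTuneReporter])
    trainer.fit(X_train=X_train, X_val=X_val, n_epochs=5,
                batch_size=config["batch_size"])

# X_wide_train, X_tab_train, y_train etc. are placeholders for pre-built arrays
X_train = {"X_wide": X_wide_train, "X_tab": X_tab_train, "target": y_train}
X_val = {"X_wide": X_wide_valid, "X_tab": X_tab_valid, "target": y_valid}
analysis = tune.run(
    tune.with_parameters(training_function, X_train=X_train, X_val=X_val),
    config=config,
)
```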
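The reworked `Tab2Vec.transform` can additionally return the list of generated embedding-column names, and skips column renaming altogether when the preprocessor was fitted with `for_transformer=True`. A sketch of the call patterns, assuming `t2v` is an already fitted `Tab2Vec` instance, `df` a compatible `DataFrame`, and `"target"` a hypothetical target column name:

```python
# vectorized frame only
X_df = t2v.transform(df)

# frame plus the target series
X_df, y = t2v.transform(df, target_col="target")

# frame, target series, and the names of the generated embedding columns
X_df, y, embed_cols = t2v.transform(df, new_embed_col_list=True, target_col="target")
```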
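Finally, the `trainer.py` hunk only reformats an existing loop, but the loop is worth spelling out: it moves any `RayTuneReporter` to the end of the callback list, so it reports only after `History` and `LRHistory` have recorded the epoch. A standalone sketch of the same reordering, with stub classes standing in for the real callbacks:

```python
from copy import deepcopy

# stand-ins for the real pytorch_widedeep callback classes
class History: ...
class LRHistory: ...
class RayTuneReporter: ...

callbacks = [RayTuneReporter(), History(), LRHistory()]
sorted_callbacks_list = deepcopy(callbacks)
# walk the list from the back so pop() indices stay valid
for obj, i in zip(callbacks[::-1], range(len(callbacks))[::-1]):
    if isinstance(obj, RayTuneReporter):
        sorted_callbacks_list.append(sorted_callbacks_list.pop(i))

print([type(cb).__name__ for cb in sorted_callbacks_list])
# ['History', 'LRHistory', 'RayTuneReporter']
```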