From 04d40076741a9a9dfe2a4967bba28fa0dd6a878e Mon Sep 17 00:00:00 2001 From: Pavol Mulinka Date: Sun, 17 Oct 2021 19:09:20 +0200 Subject: [PATCH] =?UTF-8?q?code=20cleaned=20+=20added=20workaround=20for?= =?UTF-8?q?=20=C2=A8ImportError:=20libGL.so.1:=20cannot=20open=20shared=20?= =?UTF-8?q?object=20file:=20No=20such=20file=20or=20directory=20```=20apt-?= =?UTF-8?q?get=20update=20apt-get=20install=20ffmpeg=20libsm6=20libxext6?= =?UTF-8?q?=20=20-y=20```=C2=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/build.yml | 2 + .../12_HyperParameter_tuning_w_RayTune.ipynb | 98 +++++++------------ pytorch_widedeep/callbacks.py | 49 ++-------- pytorch_widedeep/tab2vec.py | 48 ++++++--- pytorch_widedeep/training/trainer.py | 2 +- setup.py | 2 +- 6 files changed, 83 insertions(+), 118 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 480655f..5770bfc 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -44,6 +44,8 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | + apt-get update + apt-get install ffmpeg libsm6 libxext6 -y python -m pip install --upgrade pip python -m pip install pytest-cov codecov . if [ -f requirements.txt ]; then pip install -r requirements.txt; fi diff --git a/examples/12_HyperParameter_tuning_w_RayTune.ipynb b/examples/12_HyperParameter_tuning_w_RayTune.ipynb index 5db289f..d638b72 100644 --- a/examples/12_HyperParameter_tuning_w_RayTune.ipynb +++ b/examples/12_HyperParameter_tuning_w_RayTune.ipynb @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -72,7 +72,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -631,7 +631,7 @@ "4 0.68 -0.59 2.0 -36.0 -6.9 2.02 0.14 -0.23 " ] }, - "execution_count": 6, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -644,7 +644,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -655,7 +655,7 @@ "Name: target, dtype: int64" ] }, - "execution_count": 7, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -667,7 +667,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -677,7 +677,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -694,7 +694,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -703,7 +703,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -729,7 +729,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -740,7 +740,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -787,7 +787,7 @@ ")" ] }, - "execution_count": 13, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -802,7 +802,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -815,7 +815,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -828,19 +828,19 @@ }, 
{ "cell_type": "code", - "execution_count": 18, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "== Status ==
Memory usage on this node: 2.1/12.2 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/6.69 GiB heap, 0.0/2.29 GiB objects
Result logdir: /home/palo/ray_results/_inner_2021-10-15_01-12-33
Number of trials: 2/2 (2 TERMINATED)
\n", + "== Status ==
Memory usage on this node: 3.8/12.2 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/5.86 GiB heap, 0.0/2.0 GiB objects
Result logdir: /home/palo/ray_results/_inner_2021-10-17_19-06-33
Number of trials: 2/2 (2 TERMINATED)
\n", "\n", "\n", "\n", "\n", - "\n", - "\n", + "\n", + "\n", "\n", "
Trial name status loc batch_size iter total time (s)
_inner_367dc_00000TERMINATED 1000 5 14.1115
_inner_367dc_00001TERMINATED 5000 5 13.1224
_inner_94413_00000TERMINATED 1000 5 16.337
_inner_94413_00001TERMINATED 5000 5 15.021


" ], @@ -855,7 +855,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-10-15 01:12:50,327\tINFO tune.py:448 -- Total run time: 16.66 seconds (16.61 seconds for the tuning loop).\n" + "2021-10-17 19:06:52,530\tINFO tune.py:448 -- Total run time: 21.26 seconds (19.21 seconds for the tuning loop).\n" ] } ], @@ -882,7 +882,7 @@ " initializers={'deeptabular': XavierNormal},\n", " optimizers={'deeptabular': deep_opt},\n", " metrics=[accuracy, precision, recall, f1],\n", - " verbose=0)\n", + " verbose=1)\n", "\n", " trainer.fit(X_train=X_train,\n", " X_val=X_val,\n", @@ -900,7 +900,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -909,60 +909,38 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(pid=14987)\u001b[0m sys:1: ResourceWarning: unclosed file <_io.TextIOWrapper name='/tmp/ray/session_2021-10-17_19-06-31_273508_14906/logs/worker-edb8ddf5edcb134690e06916577fafd5d2bb26af-01000000-14987.out' mode='a' encoding='utf-8'>\n", + "\u001b[2m\u001b[36m(pid=14987)\u001b[0m sys:1: ResourceWarning: unclosed file <_io.TextIOWrapper name='/tmp/ray/session_2021-10-17_19-06-31_273508_14906/logs/worker-edb8ddf5edcb134690e06916577fafd5d2bb26af-01000000-14987.err' mode='a' encoding='utf-8'>\n" + ] + } + ], "source": [ "%load_ext tensorboard" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 16, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Reusing TensorBoard on port 6006 (pid 2541), started 7:22:22 ago. (Use '!kill 2541' to kill it.)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "%tensorboard --logdir ~/ray_results" ] } ], "metadata": { + "interpreter": { + "hash": "3b99005fd577fa40f3cce433b2b92303885900e634b2b5344c07c59d06c8792d" + }, "kernelspec": { - "display_name": "Python 3", - "language": "python", + "display_name": "Python 3.8.5 64-bit ('base': conda)", "name": "python3" }, "language_info": { diff --git a/pytorch_widedeep/callbacks.py b/pytorch_widedeep/callbacks.py index 4932e47..0e43aa7 100644 --- a/pytorch_widedeep/callbacks.py +++ b/pytorch_widedeep/callbacks.py @@ -150,6 +150,7 @@ class History(Callback): This callback runs by default within :obj:`Trainer`, therefore, should not be passed to the :obj:`Trainer`. Is included here just for completion. 
""" + def on_train_begin(self, logs: Optional[Dict] = None): self.trainer.history = {} @@ -158,11 +159,11 @@ class History(Callback): ): logs = logs or {} for k, v in logs.items(): - if isinstance(v, np.ndarray):# or isinstance(v, list): + if isinstance(v, np.ndarray): v = v.tolist() - if isinstance(v, list) and len(v)>1: + if isinstance(v, list) and len(v) > 1: for i in range(len(v)): - self.trainer.history.setdefault(k+'_'+str(i), []).append(v[i]) + self.trainer.history.setdefault(k + "_" + str(i), []).append(v[i]) else: self.trainer.history.setdefault(k, []).append(v) @@ -264,10 +265,9 @@ class LRHistory(Callback): >>> trainer = Trainer(model, objective="regression", callbacks=[LRHistory(n_epochs=10)]) """ - def __init__(self, n_epochs: int, ray_tune: bool = False): + def __init__(self, n_epochs: int): super(LRHistory, self).__init__() self.n_epochs = n_epochs - self.ray_tune = ray_tune def on_epoch_begin(self, epoch: int, logs: Optional[Dict] = None): if epoch == 0 and self.trainer.lr_scheduler is not None: @@ -671,48 +671,15 @@ class EarlyStopping(Callback): class RayTuneReporter(Callback): r"""Callback that allows reporting history and lr_history values to RayTune for Hyperparameter tuning - - Parameters - ----------- - - Attributes - ---------- - - Examples - -------- - >>> from pytorch_widedeep.callbacks import RayTuneReporter - >>> from pytorch_widedeep.models import TabMlp, Wide, WideDeep - >>> from pytorch_widedeep.training import Trainer - >>> from ray import tune - >>> import tracemalloc - >>> tracemalloc.start() - >>> - >>> config={"batch_size": tune.grid_search([1000, 5000]),} - >>> embed_input = [(u, i, j) for u, i, j in zip(["a", "b", "c"][:4], [4] * 3, [8] * 3)] - >>> column_idx = {k: v for v, k in enumerate(["a", "b", "c"])} - >>> wide = Wide(10, 1) - >>> deep = TabMlp(mlp_hidden_dims=[8, 4], column_idx=column_idx, embed_input=embed_input) - >>> model = WideDeep(wide, deep) - >>> - >>> def training_function(config, X_train, X_val): - >>> batch_size = config["batch_size"] - >>> trainer = Trainer(model, objective="regression", callbacks=[RayTuneReporter]) - >>> trainer.fit(X_train=X_train, - >>> X_val=X_val, - >>> n_epochs=5, - >>> batch_size=batch_size) - >>> X_train = {"X_wide": X_wide_train, "X_tab": X_tab_train, "target": y_train} - >>> X_val = {"X_wide": X_wide_valid, "X_tab": X_tab_valid, "target": y_valid} - >>> analysis = tune.run(tune.with_parameters(training_function, X_train=X_train, X_val=X_val), - >>> config=config) """ + def on_epoch_end( self, epoch: int, logs: Optional[Dict] = None, metric: Optional[float] = None ): report_dict = {} for k, v in self.trainer.history.items(): report_dict.update({k: v[-1]}) - if hasattr(self.trainer, 'lr_history'): + if hasattr(self.trainer, "lr_history"): for k, v in self.trainer.lr_history.items(): report_dict.update({k: v[-1]}) - tune.report(report_dict) \ No newline at end of file + tune.report(report_dict) diff --git a/pytorch_widedeep/tab2vec.py b/pytorch_widedeep/tab2vec.py index 26da4de..8b7b53b 100644 --- a/pytorch_widedeep/tab2vec.py +++ b/pytorch_widedeep/tab2vec.py @@ -126,8 +126,12 @@ class Tab2Vec: """ return self - def transform(self, df: pd.DataFrame, new_embed_col_list: bool = False, - target_col: Optional[str] = None) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.Series]]: + def transform( + self, + df: pd.DataFrame, + new_embed_col_list: bool = False, + target_col: Optional[str] = None, + ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.Series]]: r""" Parameters ---------- @@ -160,22 +164,36 @@ class Tab2Vec: 
col_names = list(self.tab_preprocessor.column_idx.keys()) embed_col_names = [] - for col, vec_size in tab_preprocessor.embed_cols: - embed_col_names_temp = [col+'_'+str(i) for i in range(vec_size)] - embed_col_names.extend(embed_col_names_temp) - col_names = list(chain.from_iterable(embed_col_names_temp if item == col - else [item] for item in col_names)) - - if target_col: - if new_embed_col_list: - return pd.DataFrame(data=X_vec, columns=col_names), df[target_col], embed_col_names + if self.tab_preprocessor.for_transformer: + if target_col: + return pd.DataFrame(data=X_vec), df[target_col] else: - return pd.DataFrame(data=X_vec, columns=col_names), df[target_col] + return pd.DataFrame(data=X_vec) else: - if new_embed_col_list: - return pd.DataFrame(data=X_vec, columns=col_names), embed_col_names + for col, vec_size in self.tab_preprocessor.embed_cols: + embed_col_names_temp = [col + "_" + str(i) for i in range(vec_size)] + embed_col_names.extend(embed_col_names_temp) + col_names = list( + chain.from_iterable( + embed_col_names_temp if item == col else [item] + for item in col_names + ) + ) + + if target_col: + if new_embed_col_list: + return ( + pd.DataFrame(data=X_vec, columns=col_names), + df[target_col], + embed_col_names, + ) + else: + return pd.DataFrame(data=X_vec, columns=col_names), df[target_col] else: - return pd.DataFrame(data=X_vec, columns=col_names) + if new_embed_col_list: + return pd.DataFrame(data=X_vec, columns=col_names), embed_col_names + else: + return pd.DataFrame(data=X_vec, columns=col_names) def fit_transform( self, df: pd.DataFrame, target_col: Optional[str] = None diff --git a/pytorch_widedeep/training/trainer.py b/pytorch_widedeep/training/trainer.py index 59b3d1c..ea38d76 100644 --- a/pytorch_widedeep/training/trainer.py +++ b/pytorch_widedeep/training/trainer.py @@ -1242,7 +1242,7 @@ class Trainer: else: self.metric = None sorted_callbacks_list = deepcopy(self.callbacks) - for obj,i in zip(self.callbacks[::-1], range(len(self.callbacks))[::-1]): + for obj, i in zip(self.callbacks[::-1], range(len(self.callbacks))[::-1]): if isinstance(obj, RayTuneReporter): sorted_callbacks_list.append(sorted_callbacks_list.pop(i)) self.callbacks = sorted_callbacks_list diff --git a/setup.py b/setup.py index 0df8b7d..2f72687 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,7 @@ setup_kwargs = { "einops", "wrapt", "torchmetrics", - "ray", + "ray[tune]", ], "extras_require": extras, "python_requires": ">=3.7.0", -- GitLab
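A note on the `History.on_epoch_end` hunk above: array- or list-valued log entries (for instance per-class metrics) are now flattened into one indexed history key per element. A minimal standalone sketch of that flattening, using hypothetical metric values:

```python
import numpy as np

# same logic as the patched History.on_epoch_end, outside the Trainer
history = {}
logs = {"train_loss": 0.35, "train_precision": np.array([0.91, 0.87])}
for k, v in logs.items():
    if isinstance(v, np.ndarray):
        v = v.tolist()
    if isinstance(v, list) and len(v) > 1:
        # multi-element metrics become one history key per element
        for i in range(len(v)):
            history.setdefault(k + "_" + str(i), []).append(v[i])
    else:
        history.setdefault(k, []).append(v)

print(history)
# {'train_loss': [0.35], 'train_precision_0': [0.91], 'train_precision_1': [0.87]}
```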
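The `RayTuneReporter` callback is wired into Ray Tune by passing it to `Trainer` inside a trainable wrapped with `tune.with_parameters`, as in the docstring example this patch removes. Below is a reconstruction of that example; `X_wide_train`, `X_tab_train`, `y_train` and their validation counterparts are placeholders for pre-processed arrays, not definitions from this patch:

```python
from ray import tune

from pytorch_widedeep.callbacks import RayTuneReporter
from pytorch_widedeep.models import TabMlp, Wide, WideDeep
from pytorch_widedeep.training import Trainer

config = {"batch_size": tune.grid_search([1000, 5000])}
embed_input = [(u, i, j) for u, i, j in zip(["a", "b", "c"], [4] * 3, [8] * 3)]
column_idx = {k: v for v, k in enumerate(["a", "b", "c"])}
wide = Wide(10, 1)
deep = TabMlp(mlp_hidden_dims=[8, 4], column_idx=column_idx, embed_input=embed_input)
model = WideDeep(wide, deep)

def training_function(config, X_train, X_val):
    # RayTuneReporter pushes the last value of every history (and, if present,
    # lr_history) entry to tune.report() at the end of each epoch
    trainer = Trainer(model, objective="regression", callbacks=[RayTuneReporter])
    trainer.fit(X_train=X_train, X_val=X_val, n_epochs=5,
                batch_size=config["batch_size"])

# X_wide_train, X_tab_train, y_train etc. are placeholders for pre-built arrays
X_train = {"X_wide": X_wide_train, "X_tab": X_tab_train, "target": y_train}
X_val = {"X_wide": X_wide_valid, "X_tab": X_tab_valid, "target": y_valid}
analysis = tune.run(
    tune.with_parameters(training_function, X_train=X_train, X_val=X_val),
    config=config,
)
```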
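The reworked `Tab2Vec.transform` can additionally return the list of generated embedding-column names, and skips column renaming altogether when the preprocessor was fitted with `for_transformer=True`. A sketch of the call patterns, assuming `t2v` is an already fitted `Tab2Vec` instance, `df` a compatible `DataFrame`, and `"target"` a hypothetical target column name:

```python
# vectorized frame only
X_df = t2v.transform(df)

# frame plus the target series
X_df, y = t2v.transform(df, target_col="target")

# frame, target series, and the names of the generated embedding columns
X_df, y, embed_cols = t2v.transform(df, new_embed_col_list=True, target_col="target")
```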
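Finally, the `trainer.py` hunk only reformats an existing loop, but the loop is worth spelling out: it moves any `RayTuneReporter` to the end of the callback list, so it reports only after `History` and `LRHistory` have recorded the epoch. A standalone sketch of the same reordering, with stub classes standing in for the real callbacks:

```python
from copy import deepcopy

# stand-ins for the real pytorch_widedeep callback classes
class History: ...
class LRHistory: ...
class RayTuneReporter: ...

callbacks = [RayTuneReporter(), History(), LRHistory()]
sorted_callbacks_list = deepcopy(callbacks)
# walk the list from the back so pop() indices stay valid
for obj, i in zip(callbacks[::-1], range(len(callbacks))[::-1]):
    if isinstance(obj, RayTuneReporter):
        sorted_callbacks_list.append(sorted_callbacks_list.pop(i))

print([type(cb).__name__ for cb in sorted_callbacks_list])
# ['History', 'LRHistory', 'RayTuneReporter']
```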