Unverified commit 263663e5 authored by Javier, committed by GitHub

Merge pull request #8 from jrzaurin/codestyle

Codestyle
......@@ -9,15 +9,22 @@ __pycache__*
# jupyter / ipython
.ipynb_checkpoints
Untitled*.ipynb
# data related dirs
data/
model_weights/
weights/
# tests
.coverage
.pytest_cache
# sublime
*.sublime-workspace
sftp*-config.json
# misc
.DS_store
\ No newline at end of file
.DS_store
#mypy
.mypy_cache
\ No newline at end of file
......@@ -3,6 +3,26 @@ language: python
python:
- "3.6"
- "3.7"
matrix:
fast_finish: true
- name: "Code Style (Black/Flake8)"
install:
- pip install --upgrade pip
- pip install black
- pip install flake8
script:
# Black code style
- black --check --diff pytorch_widedeep tests examples setup.py
# Stop the build if there are Python syntax errors or undefined names
- flake8 . --count --select=E901,E999,F821,F822,F823 --ignore=E266 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
- flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --ignore=E203,E266,E501,E722,F401,F403,F405,W503,C901 --statistics
# - name: "static type check"
# script:
# - mypy pytorch_widedeep --ignore-missing-imports --no-strict-optional
after_success: skip
install:
- pip install --upgrade pip
- pip install .
......
......@@ -586,10 +586,10 @@ texts[:2]
```python
text_preprocessor = TextPreprocessor()
X_text = text_preprocessor.fit_transform(df, text_col='description')
text_preprocessor = TextPreprocessor(text_col='description')
X_text = text_preprocessor.fit_transform(df)
# From here on, any new observation can be prepared by simply running `.transform`
# new_X_text = text_preprocessor.transform(new_df, text_col='description')
# new_X_text = text_preprocessor.transform(new_df)
```
The vocabulary contains 6400 tokens
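With this change the text column is bound at construction time, so a fitted preprocessor can be reused on new data with no extra arguments. A minimal sketch of the resulting scikit-learn-style pattern, assuming `df` and `new_df` both carry a `description` column (the import path follows the `wd.preprocessing` module used elsewhere in these notebooks):
```python
from pytorch_widedeep.preprocessing import TextPreprocessor

# Fit the vocabulary once on the training dataframe...
text_preprocessor = TextPreprocessor(text_col='description')
X_text = text_preprocessor.fit_transform(df)

# ...then prepare any new observations with the already-fitted vocabulary.
new_X_text = text_preprocessor.transform(new_df)
```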
......@@ -1681,21 +1681,21 @@ padded_seq[0]
```python
image_preprocessor = wd.preprocessing.ImagePreprocessor()
X_images = image_preprocessor.fit_transform(df, img_col="id", img_path="data/airbnb/property_picture/")
image_preprocessor = wd.preprocessing.ImagePreprocessor(img_col='id', img_path="data/airbnb/property_picture/")
X_images = image_preprocessor.fit_transform(df)
# From here on, any new observation can be prepared by simply running `.transform`
# new_X_images = image_preprocessor.transform(new_df, img_col="id", img_path="data/airbnb/property_picture/")
# new_X_images = image_preprocessor.transform(new_df)
```
Reading Images from data/airbnb/property_picture/
8%|▊ | 83/1001 [00:00<00:02, 410.41it/s]
4%|▍ | 40/1001 [00:00<00:02, 391.71it/s]
Resizing
100%|██████████| 1001/1001 [00:02<00:00, 419.09it/s]
100%|██████████| 1001/1001 [00:02<00:00, 415.12it/s]
Computing normalisation metrics
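A hypothetical illustration of what the "Computing normalisation metrics" step amounts to (this is not the library's internal code): channel-wise means and standard deviations over the resized images, the usual statistics for normalising CNN inputs.
```python
import numpy as np

# X_images is assumed to be the (n_images, height, width, 3) array returned
# by fit_transform above; scale to [0, 1] before taking the statistics.
imgs = X_images.astype('float32') / 255.0
channel_mean = imgs.mean(axis=(0, 1, 2))  # one mean per RGB channel
channel_std = imgs.std(axis=(0, 1, 2))    # one std per RGB channel
```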
......@@ -1731,7 +1731,7 @@ prop_imgnames = sample(prop_imgnames, 10)
print(prop_imgnames)
```
['510940.jpg', '469775.jpg', '499555.jpg', '519598.jpg', '429444.jpg', '391898.jpg', '550146.jpg', '294247.jpg', '469630.jpg', '369374.jpg']
['512853.jpg', '460396.jpg', '92352.jpg', '472203.jpg', '534665.jpg', '529070.jpg', '549281.jpg', '499163.jpg', '218915.jpg', '526627.jpg']
......@@ -1768,7 +1768,7 @@ for i,im in enumerate(prop_imgs):
print([im.shape for im in prop_imgs])
```
[(426, 639, 3), (426, 639, 3), (426, 639, 3), (426, 639, 3), (426, 639, 3), (426, 639, 3), (426, 639, 3), (426, 639, 3), (426, 639, 3), (426, 639, 3)]
[(426, 639, 3), (426, 639, 3), (426, 639, 3), (426, 639, 3), (426, 360, 3), (426, 639, 3), (426, 639, 3), (426, 639, 3), (426, 639, 3), (426, 639, 3)]
......
......@@ -113,11 +113,11 @@ deepdense(X_deep)
tensor([[ 0.0000, -0.0000, -2.2326, 3.5109, -0.0000, -1.2939, -0.0000, -0.0000],
[-1.1958, -0.0000, -0.0000, -0.0000, 0.0000, 0.0000, -0.0000, -0.0000],
[-0.0000, -0.0000, 0.0000, 0.0000, -1.4503, -0.0000, -0.0000, 3.9983],
[-1.0092, 0.5101, -0.0000, -1.4664, -0.0000, -0.0000, -1.0064, -0.0000],
[-1.1821, 0.0000, 0.0000, -1.5362, 0.0000, 0.0000, 3.9992, -0.0000]],
tensor([[ 1.9317, -0.0000, 1.3663, -0.3984, -0.0000, -0.0000, -0.0000, -1.2662],
[ 0.0000, -1.5337, -0.0000, 0.0726, -0.4231, 3.9977, -0.0000, -0.0000],
[-0.0000, -1.5839, 3.2978, -1.7084, -1.0877, -0.9574, 0.0000, -0.0000],
[-0.0000, 1.6664, -1.6006, 0.0000, -0.0000, -0.9844, -0.0000, -0.0521],
[ 2.4249, 0.0000, -0.0000, -0.0000, 0.0000, -0.0000, 2.6460, 0.0000]],
grad_fn=<MulBackward0>)
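The two outputs differ only in random state: `deepdense` applies dropout in training mode, which zeroes a random subset of activations and rescales the survivors by 1/(1-p), hence the exact zeros above. A self-contained sketch of the effect (illustrative, not the component's code):
```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
x = torch.randn(5, 8)
# Training-mode dropout: roughly half the entries become 0 and the rest
# are scaled by 1/(1 - 0.5) = 2, as in the tensors shown above.
out = F.dropout(x, p=0.5, training=True)
print(out)
```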
......@@ -339,10 +339,10 @@ deepimage(X_img)
tensor([[-1.7624e-03, 8.7372e-03, -3.9061e-03, 2.2110e-01, -1.9655e-03,
7.9629e-02, 2.5455e-01, -1.6910e-03],
[-2.8030e-05, 1.6680e-01, -3.5123e-03, 1.5065e-01, 5.2558e-02,
5.4472e-02, 4.2029e-02, 2.3403e-02]], grad_fn=<LeakyReluBackward1>)
tensor([[ 8.4865e-02, -3.4401e-03, -9.1973e-04, 3.4269e-01, 3.2816e-02,
1.9682e-02, -8.0740e-04, 9.4898e-03],
[ 1.5473e-01, -6.2664e-03, -9.3413e-05, 3.8768e-01, -1.9963e-03,
1.1729e-01, -2.7111e-03, 1.8670e-01]], grad_fn=<LeakyReluBackward1>)
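Likewise, the `LeakyReluBackward1` grad_fn indicates the image head ends in a LeakyReLU activation. A minimal, assumed reconstruction of that final step (the 512-dimensional input is illustrative, not the actual feature size):
```python
import torch
import torch.nn as nn

head = nn.Sequential(nn.Linear(512, 8), nn.LeakyReLU())
out = head(torch.randn(2, 512))
print(out.shape)  # torch.Size([2, 8]), matching the (2, 8) outputs above
```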
......
......@@ -307,8 +307,7 @@ Have a look at notebooks one and two if you want to get a good understanding of
```python
wide_cols = ['education', 'relationship','workclass','occupation','native_country','gender']
crossed_cols = [('education', 'occupation'), ('native_country', 'occupation')]
cat_embed_cols = [('education',16), ('relationship',8), ('workclass',16),
('occupation',16),('native_country',16)]
cat_embed_cols = [('education',16), ('relationship',8), ('workclass',16), ('occupation',16),('native_country',16)]
continuous_cols = ["age","hours_per_week"]
target_col = 'income_label'
```
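These column specs feed the two preprocessors: `wide_cols` and `crossed_cols` go to the wide side, while `cat_embed_cols` pairs each categorical column with its embedding dimension for the deep side. A minimal sketch of how they are consumed, following the `prepare_wide`/`prepare_deep` names that appear later in this diff (the keyword names for `WidePreprocessor` and the import path are assumptions):
```python
from pytorch_widedeep.preprocessing import WidePreprocessor, DeepPreprocessor

prepare_wide = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
X_wide = prepare_wide.fit_transform(df)

prepare_deep = DeepPreprocessor(embed_cols=cat_embed_cols, continuous_cols=continuous_cols)
X_deep = prepare_deep.fit_transform(df)
```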
......@@ -445,16 +444,16 @@ model.fit(X_wide=X_wide, X_deep=X_deep, target=target, n_epochs=5, batch_size=25
Training
epoch 1: 100%|██████████| 153/153 [00:02<00:00, 55.95it/s, loss=0.41, metrics={'acc': 0.811}]
valid: 100%|██████████| 39/39 [00:00<00:00, 119.88it/s, loss=0.362, metrics={'acc': 0.8152}]
epoch 2: 100%|██████████| 153/153 [00:02<00:00, 58.75it/s, loss=0.35, metrics={'acc': 0.8347}]
valid: 100%|██████████| 39/39 [00:00<00:00, 99.19it/s, loss=0.353, metrics={'acc': 0.8352}]
epoch 3: 100%|██████████| 153/153 [00:02<00:00, 59.79it/s, loss=0.344, metrics={'acc': 0.8372}]
valid: 100%|██████████| 39/39 [00:00<00:00, 113.76it/s, loss=0.349, metrics={'acc': 0.8376}]
epoch 4: 100%|██████████| 153/153 [00:02<00:00, 56.89it/s, loss=0.341, metrics={'acc': 0.8389}]
valid: 100%|██████████| 39/39 [00:00<00:00, 104.95it/s, loss=0.347, metrics={'acc': 0.8388}]
epoch 5: 100%|██████████| 153/153 [00:02<00:00, 56.49it/s, loss=0.338, metrics={'acc': 0.8404}]
valid: 100%|██████████| 39/39 [00:00<00:00, 105.79it/s, loss=0.346, metrics={'acc': 0.8405}]
epoch 1: 100%|██████████| 153/153 [00:02<00:00, 56.52it/s, loss=0.412, metrics={'acc': 0.7993}]
valid: 100%|██████████| 39/39 [00:00<00:00, 123.12it/s, loss=0.352, metrics={'acc': 0.8071}]
epoch 2: 100%|██████████| 153/153 [00:02<00:00, 59.55it/s, loss=0.351, metrics={'acc': 0.8351}]
valid: 100%|██████████| 39/39 [00:00<00:00, 121.98it/s, loss=0.346, metrics={'acc': 0.8359}]
epoch 3: 100%|██████████| 153/153 [00:02<00:00, 59.82it/s, loss=0.346, metrics={'acc': 0.8377}]
valid: 100%|██████████| 39/39 [00:00<00:00, 121.88it/s, loss=0.344, metrics={'acc': 0.8384}]
epoch 4: 100%|██████████| 153/153 [00:02<00:00, 58.97it/s, loss=0.342, metrics={'acc': 0.8392}]
valid: 100%|██████████| 39/39 [00:00<00:00, 122.20it/s, loss=0.342, metrics={'acc': 0.84}]
epoch 5: 100%|██████████| 153/153 [00:02<00:00, 58.28it/s, loss=0.34, metrics={'acc': 0.8406}]
valid: 100%|██████████| 39/39 [00:00<00:00, 116.57it/s, loss=0.341, metrics={'acc': 0.8413}]
As you can see, you can run a wide and deep model in just a few lines of code
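For the record, a hedged sketch of those few lines (construction of the `wide` and `deepdense` components is elided; `method='binary'`, `BinaryAccuracy`, and `val_split` are assumptions inferred from the accuracy metric and validation bars above):
```python
from pytorch_widedeep.models import WideDeep
from pytorch_widedeep.metrics import BinaryAccuracy

model = WideDeep(wide=wide, deepdense=deepdense)
model.compile(method='binary', metrics=[BinaryAccuracy()])
model.fit(X_wide=X_wide, X_deep=X_deep, target=target,
          n_epochs=5, batch_size=256, val_split=0.2)
```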
......@@ -307,8 +307,7 @@ Have a look at notebooks one and two if you want to get a good understanding of
```python
wide_cols = ['education', 'relationship','workclass','occupation','native_country','gender']
crossed_cols = [('education', 'occupation'), ('native_country', 'occupation')]
cat_embed_cols = [('education',16), ('relationship',8), ('workclass',16),
('occupation',16),('native_country',16)]
cat_embed_cols = [('education',16), ('relationship',8), ('workclass',16), ('occupation',16),('native_country',16)]
continuous_cols = ["age","hours_per_week"]
target_col = 'income_label'
```
......@@ -477,26 +476,26 @@ model.fit(X_wide=X_wide, X_deep=X_deep, target=target, n_epochs=10, batch_size=2
Training
epoch 1: 100%|██████████| 153/153 [00:03<00:00, 44.53it/s, loss=0.783, metrics={'acc': 0.6151}]
valid: 100%|██████████| 39/39 [00:00<00:00, 116.60it/s, loss=0.469, metrics={'acc': 0.6493}]
epoch 2: 100%|██████████| 153/153 [00:03<00:00, 47.32it/s, loss=0.529, metrics={'acc': 0.7565}]
valid: 100%|██████████| 39/39 [00:00<00:00, 95.90it/s, loss=0.396, metrics={'acc': 0.7685}]
epoch 3: 100%|██████████| 153/153 [00:03<00:00, 46.55it/s, loss=0.457, metrics={'acc': 0.7907}]
valid: 100%|██████████| 39/39 [00:00<00:00, 116.63it/s, loss=0.372, metrics={'acc': 0.798}]
epoch 4: 100%|██████████| 153/153 [00:03<00:00, 49.69it/s, loss=0.421, metrics={'acc': 0.8038}]
valid: 100%|██████████| 39/39 [00:00<00:00, 128.04it/s, loss=0.366, metrics={'acc': 0.8091}]
epoch 5: 100%|██████████| 153/153 [00:03<00:00, 50.27it/s, loss=0.398, metrics={'acc': 0.815}]
valid: 100%|██████████| 39/39 [00:00<00:00, 131.81it/s, loss=0.36, metrics={'acc': 0.8188}]
epoch 6: 100%|██████████| 153/153 [00:03<00:00, 50.16it/s, loss=0.388, metrics={'acc': 0.817}]
valid: 100%|██████████| 39/39 [00:00<00:00, 130.33it/s, loss=0.36, metrics={'acc': 0.8204}]
epoch 7: 100%|██████████| 153/153 [00:03<00:00, 50.06it/s, loss=0.386, metrics={'acc': 0.8175}]
valid: 100%|██████████| 39/39 [00:00<00:00, 133.34it/s, loss=0.359, metrics={'acc': 0.8208}]
epoch 8: 100%|██████████| 153/153 [00:03<00:00, 50.43it/s, loss=0.387, metrics={'acc': 0.8189}]
valid: 100%|██████████| 39/39 [00:00<00:00, 124.92it/s, loss=0.359, metrics={'acc': 0.8221}]
epoch 9: 100%|██████████| 153/153 [00:03<00:00, 50.34it/s, loss=0.385, metrics={'acc': 0.8185}]
valid: 100%|██████████| 39/39 [00:00<00:00, 130.44it/s, loss=0.358, metrics={'acc': 0.8219}]
epoch 10: 100%|██████████| 153/153 [00:03<00:00, 50.29it/s, loss=0.384, metrics={'acc': 0.8191}]
valid: 100%|██████████| 39/39 [00:00<00:00, 129.58it/s, loss=0.358, metrics={'acc': 0.8225}]
epoch 1: 100%|██████████| 153/153 [00:03<00:00, 47.06it/s, loss=0.731, metrics={'acc': 0.6468}]
valid: 100%|██████████| 39/39 [00:00<00:00, 118.07it/s, loss=0.418, metrics={'acc': 0.6785}]
epoch 2: 100%|██████████| 153/153 [00:03<00:00, 49.72it/s, loss=0.51, metrics={'acc': 0.7637}]
valid: 100%|██████████| 39/39 [00:00<00:00, 114.38it/s, loss=0.376, metrics={'acc': 0.7765}]
epoch 3: 100%|██████████| 153/153 [00:03<00:00, 49.32it/s, loss=0.448, metrics={'acc': 0.7927}]
valid: 100%|██████████| 39/39 [00:00<00:00, 113.23it/s, loss=0.361, metrics={'acc': 0.8007}]
epoch 4: 100%|██████████| 153/153 [00:03<00:00, 48.23it/s, loss=0.413, metrics={'acc': 0.8079}]
valid: 100%|██████████| 39/39 [00:00<00:00, 118.17it/s, loss=0.355, metrics={'acc': 0.8132}]
epoch 5: 100%|██████████| 153/153 [00:03<00:00, 48.27it/s, loss=0.395, metrics={'acc': 0.8149}]
valid: 100%|██████████| 39/39 [00:00<00:00, 114.71it/s, loss=0.352, metrics={'acc': 0.8191}]
epoch 6: 100%|██████████| 153/153 [00:03<00:00, 48.97it/s, loss=0.387, metrics={'acc': 0.8157}]
valid: 100%|██████████| 39/39 [00:00<00:00, 116.88it/s, loss=0.352, metrics={'acc': 0.8199}]
epoch 7: 100%|██████████| 153/153 [00:03<00:00, 48.56it/s, loss=0.388, metrics={'acc': 0.8153}]
valid: 100%|██████████| 39/39 [00:00<00:00, 116.33it/s, loss=0.352, metrics={'acc': 0.8195}]
epoch 8: 100%|██████████| 153/153 [00:03<00:00, 48.54it/s, loss=0.383, metrics={'acc': 0.8184}]
valid: 100%|██████████| 39/39 [00:00<00:00, 93.05it/s, loss=0.351, metrics={'acc': 0.822}]
epoch 9: 100%|██████████| 153/153 [00:03<00:00, 45.71it/s, loss=0.385, metrics={'acc': 0.8196}]
valid: 100%|██████████| 39/39 [00:00<00:00, 110.48it/s, loss=0.351, metrics={'acc': 0.8229}]
epoch 10: 100%|██████████| 153/153 [00:03<00:00, 48.44it/s, loss=0.382, metrics={'acc': 0.8194}]
valid: 100%|██████████| 39/39 [00:00<00:00, 114.40it/s, loss=0.35, metrics={'acc': 0.8228}]
......@@ -646,7 +645,7 @@ model.history.epoch
print(model.history._history)
```
{'train_loss': [0.7826161832591287, 0.5294494130253012, 0.45743006565212424, 0.4206276263286865, 0.3982163554702709, 0.3881325295158461, 0.3862898593244989, 0.38681577603801404, 0.38500378529230755, 0.38388273743243], 'train_acc': [0.6151, 0.7565, 0.7907, 0.8038, 0.815, 0.817, 0.8175, 0.8189, 0.8185, 0.8191], 'val_loss': [0.4694176025879689, 0.3960292133001181, 0.37219820802028364, 0.3658289725963886, 0.3600605313594525, 0.35951805343994725, 0.35915129765486103, 0.3585702692851042, 0.3578468553530864, 0.3576407875770178], 'val_acc': [0.6493, 0.7685, 0.798, 0.8091, 0.8188, 0.8204, 0.8208, 0.8221, 0.8219, 0.8225]}
{'train_loss': [0.7313813343157176, 0.5101876866583731, 0.44813506724008545, 0.41332343941420513, 0.3945406624694276, 0.3871746306715448, 0.3884129401515512, 0.38312816230300206, 0.3847907395923839, 0.3817657043341718], 'train_acc': [0.6468, 0.7637, 0.7927, 0.8079, 0.8149, 0.8157, 0.8153, 0.8184, 0.8196, 0.8194], 'val_loss': [0.41844800649545133, 0.3759944920356457, 0.36132928041311413, 0.3554159953044011, 0.3523857922126085, 0.3518377657120044, 0.35156664175864977, 0.35120767278549, 0.35089820012068135, 0.35047405576094603], 'val_acc': [0.6785, 0.7765, 0.8007, 0.8132, 0.8191, 0.8199, 0.8195, 0.822, 0.8229, 0.8228]}
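Since `_history` is a plain dict of per-epoch lists, it can be inspected or plotted directly; an illustrative snippet:
```python
import matplotlib.pyplot as plt

hist = model.history._history
plt.plot(hist['train_loss'], label='train_loss')
plt.plot(hist['val_loss'], label='val_loss')
plt.xlabel('epoch')
plt.legend()
plt.show()
```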
......@@ -669,71 +668,69 @@ model.get_embeddings(col_name='education', cat_encoding_dict=preprocess_deep.enc
{'11th': array([-1.08425401e-01, 5.09871461e-04, 1.25755548e-01, -1.20801523e-01,
-2.56043434e-01, -3.55644524e-02, -8.66190940e-02, -1.39202878e-01,
1.11087626e-04, 4.54997361e-01, -2.31609955e-01, -1.36443637e-02,
8.78131837e-02, -3.07353675e-01, -1.10240346e-02, 6.45920560e-02],
dtype=float32),
'HS-grad': array([ 0.19832617, 0.12040217, -0.5314197 , 0.35005897, -0.15391229,
-0.22196807, 0.09345723, 0.06745315, 0.25015768, 0.08744714,
0.24480642, -0.08957793, 0.27947524, -0.26326123, -0.19119193,
-0.10995993], dtype=float32),
'Assoc-acdm': array([ 0.06525454, -0.2618052 , -0.09840333, 0.10541438, 0.33471954,
-0.04292247, 0.10712572, 0.34287837, -0.18687049, -0.13836485,
-0.1715912 , 0.15273218, -0.03476759, -0.07450581, 0.56081617,
0.29201028], dtype=float32),
'Some-college': array([-0.45491776, -0.17205039, 0.21580465, -0.2539856 , 0.02358766,
-0.05496917, -0.01120283, 0.09221312, -0.12831998, 0.17159238,
0.196605 , -0.2090644 , -0.11193639, -0.18394227, -0.16056207,
0.02444198], dtype=float32),
'10th': array([-0.5581912 , -0.20644131, 0.1300292 , -0.10135209, 0.4538276 ,
-0.27146348, 0.12652951, 0.5233289 , 0.01145706, -0.05667543,
0.43509725, -0.74307233, 0.00139265, 0.07225899, 0.0781986 ,
-0.2610258 ], dtype=float32),
'Prof-school': array([-6.5744489e-02, 1.3956554e-01, 5.7986474e-01, 2.7874210e-01,
-2.4446699e-01, 7.9873689e-02, -3.8569799e-01, 2.2757685e-01,
-3.8109139e-02, 3.3144853e-01, -3.8229354e-02, 2.9802489e-01,
-1.5467829e-01, 5.4805580e-04, -2.1627106e-01, -2.6592135e-02],
dtype=float32),
'7th-8th': array([ 0.10858492, 0.42190084, 0.07536066, -0.11707054, 0.05351719,
0.32636967, 0.14053936, 0.45679298, -0.2558197 , -0.47910702,
0.4725715 , -0.0981419 , 0.3462793 , 0.07776859, -0.45930195,
0.12625834], dtype=float32),
'Bachelors': array([ 0.01805384, -0.10573057, 0.25564098, -0.27709666, -0.16297452,
-0.1851758 , -0.5702467 , -0.23569717, 0.067039 , -0.28916818,
-0.22313781, -0.23893505, 0.37708414, 0.17465928, -0.47459307,
0.04889947], dtype=float32),
'Masters': array([ 0.11953138, 0.11543513, -0.3954705 , 0.32583147, 0.23851769,
-0.6448425 , 0.00705628, 0.10673986, -0.08305098, -0.10872949,
-0.46080047, -0.05367367, -0.18693425, 0.14182107, -0.39178014,
-0.23969549], dtype=float32),
'Doctorate': array([ 0.04873321, -0.19027464, -0.10777274, -0.17476888, 0.47248197,
-0.2873778 , -0.29792303, -0.06811561, 0.16541322, -0.17425427,
-0.09404507, 0.06525683, 0.06408301, 0.38656166, 0.13369907,
0.10825544], dtype=float32),
'5th-6th': array([ 0.08566641, 0.03589746, 0.17174615, 0.08747724, 0.2698885 ,
0.08344392, -0.23652045, 0.31357667, 0.3546634 , -0.29814255,
0.10943606, 0.45218074, -0.0614133 , -0.31987205, 0.34947518,
0.07603104], dtype=float32),
'Assoc-voc': array([-0.07830544, 0.0278313 , 0.34295908, -0.27213913, -0.20097388,
0.10972344, 0.14000823, -0.24098383, -0.16614872, 0.19084413,
-0.02334382, 0.5209352 , 0.24089335, -0.1350642 , -0.23480216,
-0.32963687], dtype=float32),
'9th': array([ 0.12994888, 0.02475524, -0.12875263, 0.0097373 , 0.38253692,
-0.2718543 , 0.13766348, 0.27845392, -0.2036348 , -0.20567507,
-0.11305337, -0.47028974, 0.07009655, -0.29621345, -0.17303236,
0.15854478], dtype=float32),
'12th': array([-0.15079321, -0.26879913, -0.5159767 , 0.30044943, 0.0295292 ,
-0.32494095, 0.20975012, 0.35193697, -0.5034315 , -0.14420179,
0.06113023, 0.22398257, 0.0087006 , 0.09041765, -0.09754901,
-0.21647781], dtype=float32),
'1st-4th': array([-0.3199786 , 0.10094872, -0.10035568, 0.10014401, -0.09340642,
-0.00761677, 0.50759906, 0.288856 , -0.18745485, 0.05442255,
0.6481828 , 0.18515776, 0.21597311, -0.21534163, 0.01798662,
-0.22816893], dtype=float32),
'Preschool': array([ 0.10035816, -0.24015287, 0.00935481, 0.05356123, -0.18744251,
-0.39735606, 0.03849271, -0.2864288 , -0.10379744, 0.20251973,
0.14565234, -0.24607188, -0.14268415, 0.1209868 , 0.04662501,
0.41015574], dtype=float32)}
{'11th': array([-0.04807916, 0.21404432, 0.12517522, -0.154123 , 0.06864536,
0.00092955, -0.38516527, -0.18440197, 0.15861034, 0.12012056,
0.55413646, -0.16920644, 0.1356924 , -0.37921003, 0.53833497,
0.08743049], dtype=float32),
'HS-grad': array([ 0.37504154, 0.34191516, 0.27299362, 0.22921972, 0.07420117,
0.34922913, 0.19239122, -0.42343035, -0.845824 , -0.07287297,
0.27455565, 0.19505064, 0.07062761, -0.5201107 , 0.37823108,
0.46134958], dtype=float32),
'Assoc-acdm': array([ 0.22331461, 0.15005238, 0.13472553, -0.16886246, -0.12053325,
-0.04233408, -0.08905135, -0.54481906, 0.24300168, -0.21069968,
-0.00685616, -0.38423738, -0.00281451, 0.10599079, -0.05224385,
0.2891064 ], dtype=float32),
'Some-college': array([-0.09498356, -0.16801773, -0.09181987, 0.05381393, -0.03607363,
-0.05759075, 0.09382061, 0.33274302, -0.11906563, 0.14481838,
-0.1765725 , 0.20070277, 0.2960993 , -0.02055654, 0.26645136,
0.4075843 ], dtype=float32),
'10th': array([-0.12961714, -0.27546212, 0.24345328, -0.24318363, 0.31552687,
0.16653115, -0.05234893, 0.06825106, 0.2388588 , 0.10887478,
-0.12004007, -0.00373614, -0.0223387 , 0.133562 , 0.29672143,
0.03046475], dtype=float32),
'Prof-school': array([-0.1589678 , -0.07629952, 0.00763621, 0.13788143, 0.4114019 ,
0.07717889, -0.17072953, 0.29419565, -0.18929462, -0.09182461,
-0.08409152, 0.01395322, -0.20351669, 0.18333136, -0.03983613,
-0.31888708], dtype=float32),
'7th-8th': array([ 0.39654806, 0.26095334, -0.3147828 , -0.41267306, -0.23983437,
-0.08034727, 0.4807234 , 0.3054779 , -0.3085564 , -0.07860225,
-0.1279486 , -0.2846014 , 0.1358583 , 0.24006395, -0.18911272,
-0.2299538 ], dtype=float32),
'Bachelors': array([ 0.35242578, -0.03246311, 0.15835243, -0.06434399, 0.03403192,
0.0088449 , 0.00627425, -0.31485453, -0.30984947, -0.23008366,
-0.09467663, 0.17246258, -0.09432375, 0.07691337, 0.70925283,
0.18795769], dtype=float32),
'Masters': array([-0.14503758, 0.0048258 , 0.58242404, 0.28511924, -0.13773848,
0.35109136, 0.05824559, 0.3609631 , 0.4700086 , 0.4251728 ,
-0.2538366 , -0.00297809, 0.1424264 , -0.12481072, -0.09403807,
0.00634856], dtype=float32),
'Doctorate': array([-0.12487873, -0.1699961 , 0.2220065 , -0.04808738, 0.09443628,
-0.21019349, -0.23745097, 0.28523713, 0.05516997, -0.04004707,
0.3316393 , 0.18710822, 0.4153885 , -0.12905155, 0.03055826,
0.0664137 ], dtype=float32),
'5th-6th': array([ 0.21891987, -0.13600409, -0.03123563, 0.16288632, -0.03479639,
-0.4221951 , 0.4688111 , 0.08145971, -0.29254073, 0.18396533,
-0.20204993, -0.03327556, -0.2558647 , 0.56448 , -0.30299884,
0.07629355], dtype=float32),
'Assoc-voc': array([-0.01987046, -0.06434393, 0.00226 , 0.08150155, -0.33775425,
-0.13507745, 0.12741297, 0.0542295 , 0.09895965, 0.067229 ,
-0.1718493 , 0.01054914, 0.10441845, -0.18814586, -0.01663602,
0.03088147], dtype=float32),
'9th': array([-0.24095939, 0.2750888 , 0.01418325, -0.36754113, 0.5431856 ,
-0.19582956, 0.03485603, 0.22838333, -0.05723334, 0.10631263,
0.06331363, -0.09572615, 0.21977316, -0.02579625, -0.13822857,
0.28736743], dtype=float32),
'12th': array([-0.20278502, -0.19245535, -0.04846343, 0.14459866, 0.25858438,
0.15333128, 0.5074635 , -0.15141617, -0.19331448, -0.2630267 ,
-0.1378872 , -0.16868882, 0.4048257 , -0.34108582, -0.23098588,
0.2859633 ], dtype=float32),
'1st-4th': array([-0.53678703, 0.19669479, -0.18026853, 0.33791658, 0.14260627,
0.20269199, 0.00518189, 0.01120056, 0.01568659, 0.28752655,
0.3359768 , 0.01758064, 0.11630564, -0.35470524, -0.05704446,
0.41216984], dtype=float32),
'Preschool': array([ 0.10326536, -0.02895411, 0.11348445, 0.03685748, 0.55893034,
-0.2522173 , -0.07186767, -0.30955225, -0.17825711, 0.02907414,
-0.61121726, 0.40596214, 0.63471395, 0.3304132 , 0.05272925,
-0.4266447 ], dtype=float32)}
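An illustrative use of the returned dict, here bound to a hypothetical name `embeddings`: measuring how close two learned `education` vectors ended up.
```python
import numpy as np

# `embeddings` stands for the dict printed above.
a, b = embeddings['Bachelors'], embeddings['Masters']
cos = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
print(f"cosine(Bachelors, Masters) = {cos:.3f}")
```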
......@@ -729,8 +729,8 @@ X_deep = deep_preprocessor.fit_transform(df)
```python
text_preprocessor = TextPreprocessor(word_vectors_path=word_vectors_path)
X_text = text_preprocessor.fit_transform(df, text_col)
text_preprocessor = TextPreprocessor(word_vectors_path=word_vectors_path, text_col=text_col)
X_text = text_preprocessor.fit_transform(df)
```
The vocabulary contains 6400 tokens
......@@ -742,19 +742,19 @@ X_text = text_preprocessor.fit_transform(df, text_col)
```python
image_processor = ImagePreprocessor()
X_images = image_processor.fit_transform(df, img_col, img_path)
image_processor = ImagePreprocessor(img_col = img_col, img_path = img_path)
X_images = image_processor.fit_transform(df)
```
Reading Images from data/airbnb/property_picture
4%|▍ | 43/1001 [00:00<00:02, 428.36it/s]
9%|▊ | 87/1001 [00:00<00:02, 428.89it/s]
Resizing
100%|██████████| 1001/1001 [00:02<00:00, 425.98it/s]
100%|██████████| 1001/1001 [00:02<00:00, 426.92it/s]
Computing normalisation metrics
......@@ -804,8 +804,8 @@ model.fit(X_wide=X_wide, X_deep=X_deep, X_text=X_text, X_img=X_images,
Training
epoch 1: 100%|██████████| 25/25 [02:04<00:00, 4.97s/it, loss=117]
valid: 100%|██████████| 7/7 [00:14<00:00, 2.00s/it, loss=122]
epoch 1: 100%|██████████| 25/25 [02:06<00:00, 5.05s/it, loss=118]
valid: 100%|██████████| 7/7 [00:14<00:00, 2.01s/it, loss=226]
### Regression with varying parameters and a FC-Head receiving the full deep side
......@@ -1050,8 +1050,8 @@ model.compile(method='regression', initializers=initializers, optimizers=optimiz
lr_schedulers=schedulers, callbacks=callbacks, transforms=transforms)
```
/Users/javier/pytorch-widedeep/pytorch_widedeep/initializers.py:32: UserWarning: No initializer found for deephead
if self.verbose: warnings.warn("No initializer found for {}".format(name))
/Users/javier/pytorch-widedeep/pytorch_widedeep/initializers.py:31: UserWarning: No initializer found for deephead
warnings.warn("No initializer found for {}".format(name))
......@@ -1221,8 +1221,8 @@ model.fit(X_wide=X_wide, X_deep=X_deep, X_text=X_text, X_img=X_images,
Training
epoch 1: 100%|██████████| 25/25 [02:08<00:00, 5.15s/it, loss=128]
valid: 100%|██████████| 7/7 [00:14<00:00, 2.05s/it, loss=95.5]
epoch 1: 100%|██████████| 25/25 [02:04<00:00, 4.97s/it, loss=127]
valid: 100%|██████████| 7/7 [00:14<00:00, 2.02s/it, loss=94]
We have only run one epoch, but let's check that the LRHistory callback records the learning rate values for each parameter group
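A hedged sketch of that check, assuming the callback exposes its record as `model.lr_history` (the `lr_deephead_0` entry shown further down suggests one list of lr values per parameter group):
```python
# One entry per parameter group, one learning rate value per epoch.
for group, lrs in model.lr_history.items():
    print(group, lrs)
```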
......
......@@ -92,36 +92,36 @@ model.fit(X_wide=X_wide, X_deep=X_deep, target=target, n_epochs=5, batch_size=25
Warming up wide for 5 epochs
epoch 1: 100%|██████████| 153/153 [00:01<00:00, 133.55it/s, loss=0.469, metrics={'acc': 0.7963}]
epoch 2: 100%|██████████| 153/153 [00:00<00:00, 156.42it/s, loss=0.372, metrics={'acc': 0.8119}]
epoch 3: 100%|██████████| 153/153 [00:01<00:00, 152.65it/s, loss=0.363, metrics={'acc': 0.8183}]
epoch 4: 100%|██████████| 153/153 [00:01<00:00, 144.01it/s, loss=0.36, metrics={'acc': 0.8218}]
epoch 5: 100%|██████████| 153/153 [00:00<00:00, 155.56it/s, loss=0.359, metrics={'acc': 0.8241}]
epoch 1: 100%|██████████| 153/153 [00:01<00:00, 131.88it/s, loss=0.471, metrics={'acc': 0.7946}]
epoch 2: 100%|██████████| 153/153 [00:00<00:00, 154.81it/s, loss=0.373, metrics={'acc': 0.8115}]
epoch 3: 100%|██████████| 153/153 [00:01<00:00, 151.56it/s, loss=0.364, metrics={'acc': 0.8182}]
epoch 4: 100%|██████████| 153/153 [00:00<00:00, 154.22it/s, loss=0.362, metrics={'acc': 0.8216}]
epoch 5: 100%|██████████| 153/153 [00:01<00:00, 152.65it/s, loss=0.36, metrics={'acc': 0.8238}]
0%| | 0/153 [00:00<?, ?it/s]
Warming up deepdense for 5 epochs
epoch 1: 100%|██████████| 153/153 [00:02<00:00, 64.03it/s, loss=0.4, metrics={'acc': 0.8214}]
epoch 2: 100%|██████████| 153/153 [00:02<00:00, 64.30it/s, loss=0.347, metrics={'acc': 0.8237}]
epoch 3: 100%|██████████| 153/153 [00:02<00:00, 64.18it/s, loss=0.341, metrics={'acc': 0.8258}]
epoch 4: 100%|██████████| 153/153 [00:02<00:00, 64.98it/s, loss=0.337, metrics={'acc': 0.8277}]
epoch 5: 100%|██████████| 153/153 [00:02<00:00, 65.70it/s, loss=0.333, metrics={'acc': 0.8294}]
epoch 1: 100%|██████████| 153/153 [00:02<00:00, 64.04it/s, loss=0.395, metrics={'acc': 0.8222}]
epoch 2: 100%|██████████| 153/153 [00:02<00:00, 65.34it/s, loss=0.349, metrics={'acc': 0.8242}]
epoch 3: 100%|██████████| 153/153 [00:02<00:00, 65.05it/s, loss=0.343, metrics={'acc': 0.8262}]
epoch 4: 100%|██████████| 153/153 [00:02<00:00, 64.93it/s, loss=0.339, metrics={'acc': 0.8279}]
epoch 5: 100%|██████████| 153/153 [00:02<00:00, 65.15it/s, loss=0.335, metrics={'acc': 0.8295}]
0%| | 0/153 [00:00<?, ?it/s]
Training
epoch 1: 100%|██████████| 153/153 [00:02<00:00, 58.97it/s, loss=0.343, metrics={'acc': 0.8442}]
valid: 100%|██████████| 39/39 [00:00<00:00, 123.14it/s, loss=0.349, metrics={'acc': 0.8436}]
epoch 2: 100%|██████████| 153/153 [00:02<00:00, 59.48it/s, loss=0.333, metrics={'acc': 0.8457}]
valid: 100%|██████████| 39/39 [00:00<00:00, 114.05it/s, loss=0.346, metrics={'acc': 0.8447}]
epoch 3: 100%|██████████| 153/153 [00:02<00:00, 59.82it/s, loss=0.331, metrics={'acc': 0.8471}]
valid: 100%|██████████| 39/39 [00:00<00:00, 122.31it/s, loss=0.345, metrics={'acc': 0.8457}]
epoch 4: 100%|██████████| 153/153 [00:02<00:00, 59.20it/s, loss=0.329, metrics={'acc': 0.8474}]
valid: 100%|██████████| 39/39 [00:00<00:00, 113.66it/s, loss=0.345, metrics={'acc': 0.8459}]
epoch 5: 100%|██████████| 153/153 [00:02<00:00, 59.15it/s, loss=0.328, metrics={'acc': 0.8479}]
valid: 100%|██████████| 39/39 [00:00<00:00, 118.58it/s, loss=0.345, metrics={'acc': 0.8462}]
epoch 1: 100%|██████████| 153/153 [00:02<00:00, 58.31it/s, loss=0.345, metrics={'acc': 0.8415}]
valid: 100%|██████████| 39/39 [00:00<00:00, 120.77it/s, loss=0.346, metrics={'acc': 0.8416}]
epoch 2: 100%|██████████| 153/153 [00:02<00:00, 58.33it/s, loss=0.335, metrics={'acc': 0.8446}]
valid: 100%|██████████| 39/39 [00:00<00:00, 117.88it/s, loss=0.344, metrics={'acc': 0.8438}]
epoch 3: 100%|██████████| 153/153 [00:02<00:00, 58.43it/s, loss=0.331, metrics={'acc': 0.8457}]
valid: 100%|██████████| 39/39 [00:00<00:00, 109.26it/s, loss=0.343, metrics={'acc': 0.8449}]
epoch 4: 100%|██████████| 153/153 [00:02<00:00, 58.08it/s, loss=0.329, metrics={'acc': 0.8457}]
valid: 100%|██████████| 39/39 [00:00<00:00, 120.23it/s, loss=0.344, metrics={'acc': 0.8446}]
epoch 5: 100%|██████████| 153/153 [00:02<00:00, 58.75it/s, loss=0.327, metrics={'acc': 0.8464}]
valid: 100%|██████████| 39/39 [00:00<00:00, 119.22it/s, loss=0.344, metrics={'acc': 0.8453}]
### Warm up Gradually: The "felbo" and the "howard" routines
......@@ -187,11 +187,11 @@ X_wide = prepare_wide.fit_transform(df)
prepare_deep = DeepPreprocessor(embed_cols=cat_embed_cols, continuous_cols=continuous_cols)
X_deep = prepare_deep.fit_transform(df)
text_processor = TextPreprocessor(word_vectors_path=word_vectors_path)
X_text = text_processor.fit_transform(df, text_col)
text_processor = TextPreprocessor(word_vectors_path=word_vectors_path, text_col=text_col)
X_text = text_processor.fit_transform(df)
image_processor = ImagePreprocessor()
X_images = image_processor.fit_transform(df, img_col, img_path)
image_processor = ImagePreprocessor(img_col=img_col, img_path=img_path)
X_images = image_processor.fit_transform(df)
```
The vocabulary contains 6400 tokens
......@@ -202,12 +202,12 @@ X_images = image_processor.fit_transform(df, img_col, img_path)
Reading Images from data/airbnb/property_picture
8%|▊ | 84/1001 [00:00<00:02, 413.82it/s]
3%|▎ | 29/1001 [00:00<00:03, 282.66it/s]
Resizing
100%|██████████| 1001/1001 [00:02<00:00, 419.78it/s]
100%|██████████| 1001/1001 [00:03<00:00, 327.44it/s]
Computing normalisation metrics
......@@ -501,56 +501,56 @@ model.fit(X_wide=X_wide, X_deep=X_deep, X_text=X_text, X_img=X_images, target=ta
Warming up wide for 1 epochs
epoch 1: 100%|██████████| 25/25 [00:00<00:00, 42.89it/s, loss=127]
epoch 1: 100%|██████████| 25/25 [00:00<00:00, 57.98it/s, loss=127]
0%| | 0/25 [00:00<?, ?it/s]
Warming up deepdense for 1 epochs
epoch 1: 100%|██████████| 25/25 [00:00<00:00, 43.28it/s, loss=117]
epoch 1: 100%|██████████| 25/25 [00:00<00:00, 45.81it/s, loss=116]
0%| | 0/25 [00:00<?, ?it/s]
Warming up deeptext for 1 epochs
epoch 1: 100%|██████████| 25/25 [00:05<00:00, 4.78it/s, loss=132]
epoch 1: 100%|██████████| 25/25 [00:04<00:00, 5.37it/s, loss=132]
0%| | 0/25 [00:00<?, ?it/s]
Warming up deepimage, layer 1 of 5
epoch 1: 100%|██████████| 25/25 [01:11<00:00, 2.86s/it, loss=119]
epoch 1: 100%|██████████| 25/25 [01:10<00:00, 2.83s/it, loss=119]
0%| | 0/25 [00:00<?, ?it/s]
Warming up deepimage, layer 2 of 5
epoch 1: 100%|██████████| 25/25 [01:35<00:00, 3.81s/it, loss=108]
epoch 1: 100%|██████████| 25/25 [01:34<00:00, 3.76s/it, loss=108]
0%| | 0/25 [00:00<?, ?it/s]
Warming up deepimage, layer 3 of 5
epoch 1: 100%|██████████| 25/25 [01:58<00:00, 4.76s/it, loss=105]
epoch 1: 100%|██████████| 25/25 [01:57<00:00, 4.69s/it, loss=106]
0%| | 0/25 [00:00<?, ?it/s]
Warming up deepimage, layer 4 of 5
epoch 1: 100%|██████████| 25/25 [02:25<00:00, 5.80s/it, loss=105]
epoch 1: 100%|██████████| 25/25 [02:24<00:00, 5.79s/it, loss=105]
0%| | 0/25 [00:00<?, ?it/s]
Warming up deepimage, layer 5 of 5
epoch 1: 100%|██████████| 25/25 [03:04<00:00, 7.38s/it, loss=106]
epoch 1: 100%|██████████| 25/25 [03:01<00:00, 7.26s/it, loss=105]
0%| | 0/25 [00:00<?, ?it/s]
Training
epoch 1: 100%|██████████| 25/25 [02:05<00:00, 5.00s/it, loss=130]
valid: 100%|██████████| 7/7 [00:14<00:00, 2.04s/it, loss=125]
epoch 1: 100%|██████████| 25/25 [02:05<00:00, 5.03s/it, loss=129]
valid: 100%|██████████| 7/7 [00:14<00:00, 2.11s/it, loss=103]
And one would access the `felbo` routine by changing the parameter `warm_routine` to `'felbo'`
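A hypothetical call, with the data arguments carried over from the fit shown above (`warm_up=True` and the remaining keywords are assumptions; only `warm_routine` is confirmed by the text):
```python
model.fit(X_wide=X_wide, X_deep=X_deep, X_text=X_text, X_img=X_images,
          target=target, n_epochs=1, batch_size=32,
          warm_up=True, warm_routine='felbo')
```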
......@@ -46,7 +46,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
......@@ -57,7 +57,7 @@
")"
]
},
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
......@@ -78,7 +78,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
......@@ -89,7 +89,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
......@@ -112,7 +112,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
......@@ -122,7 +122,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 6,
"metadata": {},
"outputs": [
{
......@@ -153,7 +153,7 @@
")"
]
},
"execution_count": 8,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
......@@ -164,21 +164,21 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[ 0.0000, -0.0000, -2.2326, 3.5109, -0.0000, -1.2939, -0.0000, -0.0000],\n",
" [-1.1958, -0.0000, -0.0000, -0.0000, 0.0000, 0.0000, -0.0000, -0.0000],\n",
" [-0.0000, -0.0000, 0.0000, 0.0000, -1.4503, -0.0000, -0.0000, 3.9983],\n",
" [-1.0092, 0.5101, -0.0000, -1.4664, -0.0000, -0.0000, -1.0064, -0.0000],\n",
" [-1.1821, 0.0000, 0.0000, -1.5362, 0.0000, 0.0000, 3.9992, -0.0000]],\n",
"tensor([[ 1.9317, -0.0000, 1.3663, -0.3984, -0.0000, -0.0000, -0.0000, -1.2662],\n",
" [ 0.0000, -1.5337, -0.0000, 0.0726, -0.4231, 3.9977, -0.0000, -0.0000],\n",
" [-0.0000, -1.5839, 3.2978, -1.7084, -1.0877, -0.9574, 0.0000, -0.0000],\n",
" [-0.0000, 1.6664, -1.6006, 0.0000, -0.0000, -0.9844, -0.0000, -0.0521],\n",
" [ 2.4249, 0.0000, -0.0000, -0.0000, 0.0000, -0.0000, 2.6460, 0.0000]],\n",
" grad_fn=<MulBackward0>)"
]
},
"execution_count": 9,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
......@@ -202,7 +202,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
......@@ -221,7 +221,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
......@@ -230,7 +230,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
......@@ -239,7 +239,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 11,
"metadata": {},
"outputs": [
{
......@@ -251,7 +251,7 @@
")"
]
},
"execution_count": 14,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
......@@ -269,7 +269,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
......@@ -279,7 +279,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 13,
"metadata": {},
"outputs": [
{
......@@ -299,7 +299,7 @@
")"
]
},
"execution_count": 16,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
......@@ -326,7 +326,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
......@@ -344,7 +344,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
......@@ -353,7 +353,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
......@@ -362,7 +362,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 17,
"metadata": {},
"outputs": [
{
......@@ -467,7 +467,7 @@
")"
]
},
"execution_count": 22,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
......@@ -478,19 +478,19 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[-1.7624e-03, 8.7372e-03, -3.9061e-03, 2.2110e-01, -1.9655e-03,\n",
" 7.9629e-02, 2.5455e-01, -1.6910e-03],\n",
" [-2.8030e-05, 1.6680e-01, -3.5123e-03, 1.5065e-01, 5.2558e-02,\n",
" 5.4472e-02, 4.2029e-02, 2.3403e-02]], grad_fn=<LeakyReluBackward1>)"
"tensor([[ 8.4865e-02, -3.4401e-03, -9.1973e-04, 3.4269e-01, 3.2816e-02,\n",
" 1.9682e-02, -8.0740e-04, 9.4898e-03],\n",
" [ 1.5473e-01, -6.2664e-03, -9.3413e-05, 3.8768e-01, -1.9963e-03,\n",
" 1.1729e-01, -2.7111e-03, 1.8670e-01]], grad_fn=<LeakyReluBackward1>)"
]
},
"execution_count": 23,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
......@@ -508,7 +508,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
......@@ -587,7 +587,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
......
......@@ -387,8 +387,7 @@
"source": [
"wide_cols = ['education', 'relationship','workclass','occupation','native_country','gender']\n",
"crossed_cols = [('education', 'occupation'), ('native_country', 'occupation')]\n",
"cat_embed_cols = [('education',16), ('relationship',8), ('workclass',16),\n",
" ('occupation',16),('native_country',16)]\n",
"cat_embed_cols = [('education',16), ('relationship',8), ('workclass',16), ('occupation',16),('native_country',16)]\n",
"continuous_cols = [\"age\",\"hours_per_week\"]\n",
"target_col = 'income_label'"
]
......@@ -592,16 +591,16 @@
"name": "stderr",
"output_type": "stream",
"text": [
"epoch 1: 100%|██████████| 153/153 [00:02<00:00, 55.95it/s, loss=0.41, metrics={'acc': 0.811}] \n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 119.88it/s, loss=0.362, metrics={'acc': 0.8152}]\n",
"epoch 2: 100%|██████████| 153/153 [00:02<00:00, 58.75it/s, loss=0.35, metrics={'acc': 0.8347}] \n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 99.19it/s, loss=0.353, metrics={'acc': 0.8352}]\n",
"epoch 3: 100%|██████████| 153/153 [00:02<00:00, 59.79it/s, loss=0.344, metrics={'acc': 0.8372}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 113.76it/s, loss=0.349, metrics={'acc': 0.8376}]\n",
"epoch 4: 100%|██████████| 153/153 [00:02<00:00, 56.89it/s, loss=0.341, metrics={'acc': 0.8389}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 104.95it/s, loss=0.347, metrics={'acc': 0.8388}]\n",
"epoch 5: 100%|██████████| 153/153 [00:02<00:00, 56.49it/s, loss=0.338, metrics={'acc': 0.8404}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 105.79it/s, loss=0.346, metrics={'acc': 0.8405}]\n"
"epoch 1: 100%|██████████| 153/153 [00:02<00:00, 56.52it/s, loss=0.412, metrics={'acc': 0.7993}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 123.12it/s, loss=0.352, metrics={'acc': 0.8071}]\n",
"epoch 2: 100%|██████████| 153/153 [00:02<00:00, 59.55it/s, loss=0.351, metrics={'acc': 0.8351}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 121.98it/s, loss=0.346, metrics={'acc': 0.8359}]\n",
"epoch 3: 100%|██████████| 153/153 [00:02<00:00, 59.82it/s, loss=0.346, metrics={'acc': 0.8377}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 121.88it/s, loss=0.344, metrics={'acc': 0.8384}]\n",
"epoch 4: 100%|██████████| 153/153 [00:02<00:00, 58.97it/s, loss=0.342, metrics={'acc': 0.8392}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 122.20it/s, loss=0.342, metrics={'acc': 0.84}] \n",
"epoch 5: 100%|██████████| 153/153 [00:02<00:00, 58.28it/s, loss=0.34, metrics={'acc': 0.8406}] \n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 116.57it/s, loss=0.341, metrics={'acc': 0.8413}]\n"
]
}
],
......
......@@ -381,14 +381,13 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"wide_cols = ['education', 'relationship','workclass','occupation','native_country','gender']\n",
"crossed_cols = [('education', 'occupation'), ('native_country', 'occupation')]\n",
"cat_embed_cols = [('education',16), ('relationship',8), ('workclass',16),\n",
" ('occupation',16),('native_country',16)]\n",
"cat_embed_cols = [('education',16), ('relationship',8), ('workclass',16), ('occupation',16),('native_country',16)]\n",
"continuous_cols = [\"age\",\"hours_per_week\"]\n",
"target_col = 'income_label'"
]
......@@ -485,7 +484,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
......@@ -500,7 +499,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [
{
......@@ -540,7 +539,7 @@
")"
]
},
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
......@@ -565,7 +564,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
......@@ -576,7 +575,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
......@@ -597,7 +596,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
......@@ -612,7 +611,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
......@@ -624,7 +623,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"metadata": {},
"outputs": [
{
......@@ -646,26 +645,26 @@
"name": "stderr",
"output_type": "stream",
"text": [
"epoch 1: 100%|██████████| 153/153 [00:03<00:00, 44.53it/s, loss=0.783, metrics={'acc': 0.6151}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 116.60it/s, loss=0.469, metrics={'acc': 0.6493}]\n",
"epoch 2: 100%|██████████| 153/153 [00:03<00:00, 47.32it/s, loss=0.529, metrics={'acc': 0.7565}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 95.90it/s, loss=0.396, metrics={'acc': 0.7685}]\n",
"epoch 3: 100%|██████████| 153/153 [00:03<00:00, 46.55it/s, loss=0.457, metrics={'acc': 0.7907}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 116.63it/s, loss=0.372, metrics={'acc': 0.798}]\n",
"epoch 4: 100%|██████████| 153/153 [00:03<00:00, 49.69it/s, loss=0.421, metrics={'acc': 0.8038}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 128.04it/s, loss=0.366, metrics={'acc': 0.8091}]\n",
"epoch 5: 100%|██████████| 153/153 [00:03<00:00, 50.27it/s, loss=0.398, metrics={'acc': 0.815}] \n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 131.81it/s, loss=0.36, metrics={'acc': 0.8188}]\n",
"epoch 6: 100%|██████████| 153/153 [00:03<00:00, 50.16it/s, loss=0.388, metrics={'acc': 0.817}] \n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 130.33it/s, loss=0.36, metrics={'acc': 0.8204}]\n",
"epoch 7: 100%|██████████| 153/153 [00:03<00:00, 50.06it/s, loss=0.386, metrics={'acc': 0.8175}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 133.34it/s, loss=0.359, metrics={'acc': 0.8208}]\n",
"epoch 8: 100%|██████████| 153/153 [00:03<00:00, 50.43it/s, loss=0.387, metrics={'acc': 0.8189}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 124.92it/s, loss=0.359, metrics={'acc': 0.8221}]\n",
"epoch 9: 100%|██████████| 153/153 [00:03<00:00, 50.34it/s, loss=0.385, metrics={'acc': 0.8185}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 130.44it/s, loss=0.358, metrics={'acc': 0.8219}]\n",
"epoch 10: 100%|██████████| 153/153 [00:03<00:00, 50.29it/s, loss=0.384, metrics={'acc': 0.8191}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 129.58it/s, loss=0.358, metrics={'acc': 0.8225}]\n"
"epoch 1: 100%|██████████| 153/153 [00:03<00:00, 47.06it/s, loss=0.731, metrics={'acc': 0.6468}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 118.07it/s, loss=0.418, metrics={'acc': 0.6785}]\n",
"epoch 2: 100%|██████████| 153/153 [00:03<00:00, 49.72it/s, loss=0.51, metrics={'acc': 0.7637}] \n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 114.38it/s, loss=0.376, metrics={'acc': 0.7765}]\n",
"epoch 3: 100%|██████████| 153/153 [00:03<00:00, 49.32it/s, loss=0.448, metrics={'acc': 0.7927}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 113.23it/s, loss=0.361, metrics={'acc': 0.8007}]\n",
"epoch 4: 100%|██████████| 153/153 [00:03<00:00, 48.23it/s, loss=0.413, metrics={'acc': 0.8079}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 118.17it/s, loss=0.355, metrics={'acc': 0.8132}]\n",
"epoch 5: 100%|██████████| 153/153 [00:03<00:00, 48.27it/s, loss=0.395, metrics={'acc': 0.8149}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 114.71it/s, loss=0.352, metrics={'acc': 0.8191}]\n",
"epoch 6: 100%|██████████| 153/153 [00:03<00:00, 48.97it/s, loss=0.387, metrics={'acc': 0.8157}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 116.88it/s, loss=0.352, metrics={'acc': 0.8199}]\n",
"epoch 7: 100%|██████████| 153/153 [00:03<00:00, 48.56it/s, loss=0.388, metrics={'acc': 0.8153}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 116.33it/s, loss=0.352, metrics={'acc': 0.8195}]\n",
"epoch 8: 100%|██████████| 153/153 [00:03<00:00, 48.54it/s, loss=0.383, metrics={'acc': 0.8184}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 93.05it/s, loss=0.351, metrics={'acc': 0.822}] \n",
"epoch 9: 100%|██████████| 153/153 [00:03<00:00, 45.71it/s, loss=0.385, metrics={'acc': 0.8196}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 110.48it/s, loss=0.351, metrics={'acc': 0.8229}]\n",
"epoch 10: 100%|██████████| 153/153 [00:03<00:00, 48.44it/s, loss=0.382, metrics={'acc': 0.8194}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 114.40it/s, loss=0.35, metrics={'acc': 0.8228}]\n"
]
}
],
......@@ -675,7 +674,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 15,
"metadata": {},
"outputs": [
{
......@@ -800,7 +799,7 @@
" 'zero_grad']"
]
},
"execution_count": 16,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
......@@ -818,7 +817,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 16,
"metadata": {},
"outputs": [
{
......@@ -827,7 +826,7 @@
"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]"
]
},
"execution_count": 17,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
......@@ -838,14 +837,14 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'train_loss': [0.7826161832591287, 0.5294494130253012, 0.45743006565212424, 0.4206276263286865, 0.3982163554702709, 0.3881325295158461, 0.3862898593244989, 0.38681577603801404, 0.38500378529230755, 0.38388273743243], 'train_acc': [0.6151, 0.7565, 0.7907, 0.8038, 0.815, 0.817, 0.8175, 0.8189, 0.8185, 0.8191], 'val_loss': [0.4694176025879689, 0.3960292133001181, 0.37219820802028364, 0.3658289725963886, 0.3600605313594525, 0.35951805343994725, 0.35915129765486103, 0.3585702692851042, 0.3578468553530864, 0.3576407875770178], 'val_acc': [0.6493, 0.7685, 0.798, 0.8091, 0.8188, 0.8204, 0.8208, 0.8221, 0.8219, 0.8225]}\n"
"{'train_loss': [0.7313813343157176, 0.5101876866583731, 0.44813506724008545, 0.41332343941420513, 0.3945406624694276, 0.3871746306715448, 0.3884129401515512, 0.38312816230300206, 0.3847907395923839, 0.3817657043341718], 'train_acc': [0.6468, 0.7637, 0.7927, 0.8079, 0.8149, 0.8157, 0.8153, 0.8184, 0.8196, 0.8194], 'val_loss': [0.41844800649545133, 0.3759944920356457, 0.36132928041311413, 0.3554159953044011, 0.3523857922126085, 0.3518377657120044, 0.35156664175864977, 0.35120767278549, 0.35089820012068135, 0.35047405576094603], 'val_acc': [0.6785, 0.7765, 0.8007, 0.8132, 0.8191, 0.8199, 0.8195, 0.822, 0.8229, 0.8228]}\n"
]
}
],
......@@ -855,7 +854,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 18,
"metadata": {},
"outputs": [
{
......@@ -881,81 +880,79 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'11th': array([-1.08425401e-01, 5.09871461e-04, 1.25755548e-01, -1.20801523e-01,\n",
" -2.56043434e-01, -3.55644524e-02, -8.66190940e-02, -1.39202878e-01,\n",
" 1.11087626e-04, 4.54997361e-01, -2.31609955e-01, -1.36443637e-02,\n",
" 8.78131837e-02, -3.07353675e-01, -1.10240346e-02, 6.45920560e-02],\n",
" dtype=float32),\n",
" 'HS-grad': array([ 0.19832617, 0.12040217, -0.5314197 , 0.35005897, -0.15391229,\n",
" -0.22196807, 0.09345723, 0.06745315, 0.25015768, 0.08744714,\n",
" 0.24480642, -0.08957793, 0.27947524, -0.26326123, -0.19119193,\n",
" -0.10995993], dtype=float32),\n",
" 'Assoc-acdm': array([ 0.06525454, -0.2618052 , -0.09840333, 0.10541438, 0.33471954,\n",
" -0.04292247, 0.10712572, 0.34287837, -0.18687049, -0.13836485,\n",
" -0.1715912 , 0.15273218, -0.03476759, -0.07450581, 0.56081617,\n",
" 0.29201028], dtype=float32),\n",
" 'Some-college': array([-0.45491776, -0.17205039, 0.21580465, -0.2539856 , 0.02358766,\n",
" -0.05496917, -0.01120283, 0.09221312, -0.12831998, 0.17159238,\n",
" 0.196605 , -0.2090644 , -0.11193639, -0.18394227, -0.16056207,\n",
" 0.02444198], dtype=float32),\n",
" '10th': array([-0.5581912 , -0.20644131, 0.1300292 , -0.10135209, 0.4538276 ,\n",
" -0.27146348, 0.12652951, 0.5233289 , 0.01145706, -0.05667543,\n",
" 0.43509725, -0.74307233, 0.00139265, 0.07225899, 0.0781986 ,\n",
" -0.2610258 ], dtype=float32),\n",
" 'Prof-school': array([-6.5744489e-02, 1.3956554e-01, 5.7986474e-01, 2.7874210e-01,\n",
" -2.4446699e-01, 7.9873689e-02, -3.8569799e-01, 2.2757685e-01,\n",
" -3.8109139e-02, 3.3144853e-01, -3.8229354e-02, 2.9802489e-01,\n",
" -1.5467829e-01, 5.4805580e-04, -2.1627106e-01, -2.6592135e-02],\n",
" dtype=float32),\n",
" '7th-8th': array([ 0.10858492, 0.42190084, 0.07536066, -0.11707054, 0.05351719,\n",
" 0.32636967, 0.14053936, 0.45679298, -0.2558197 , -0.47910702,\n",
" 0.4725715 , -0.0981419 , 0.3462793 , 0.07776859, -0.45930195,\n",
" 0.12625834], dtype=float32),\n",
" 'Bachelors': array([ 0.01805384, -0.10573057, 0.25564098, -0.27709666, -0.16297452,\n",
" -0.1851758 , -0.5702467 , -0.23569717, 0.067039 , -0.28916818,\n",
" -0.22313781, -0.23893505, 0.37708414, 0.17465928, -0.47459307,\n",
" 0.04889947], dtype=float32),\n",
" 'Masters': array([ 0.11953138, 0.11543513, -0.3954705 , 0.32583147, 0.23851769,\n",
" -0.6448425 , 0.00705628, 0.10673986, -0.08305098, -0.10872949,\n",
" -0.46080047, -0.05367367, -0.18693425, 0.14182107, -0.39178014,\n",
" -0.23969549], dtype=float32),\n",
" 'Doctorate': array([ 0.04873321, -0.19027464, -0.10777274, -0.17476888, 0.47248197,\n",
" -0.2873778 , -0.29792303, -0.06811561, 0.16541322, -0.17425427,\n",
" -0.09404507, 0.06525683, 0.06408301, 0.38656166, 0.13369907,\n",
" 0.10825544], dtype=float32),\n",
" '5th-6th': array([ 0.08566641, 0.03589746, 0.17174615, 0.08747724, 0.2698885 ,\n",
" 0.08344392, -0.23652045, 0.31357667, 0.3546634 , -0.29814255,\n",
" 0.10943606, 0.45218074, -0.0614133 , -0.31987205, 0.34947518,\n",
" 0.07603104], dtype=float32),\n",
" 'Assoc-voc': array([-0.07830544, 0.0278313 , 0.34295908, -0.27213913, -0.20097388,\n",
" 0.10972344, 0.14000823, -0.24098383, -0.16614872, 0.19084413,\n",
" -0.02334382, 0.5209352 , 0.24089335, -0.1350642 , -0.23480216,\n",
" -0.32963687], dtype=float32),\n",
" '9th': array([ 0.12994888, 0.02475524, -0.12875263, 0.0097373 , 0.38253692,\n",
" -0.2718543 , 0.13766348, 0.27845392, -0.2036348 , -0.20567507,\n",
" -0.11305337, -0.47028974, 0.07009655, -0.29621345, -0.17303236,\n",
" 0.15854478], dtype=float32),\n",
" '12th': array([-0.15079321, -0.26879913, -0.5159767 , 0.30044943, 0.0295292 ,\n",
" -0.32494095, 0.20975012, 0.35193697, -0.5034315 , -0.14420179,\n",
" 0.06113023, 0.22398257, 0.0087006 , 0.09041765, -0.09754901,\n",
" -0.21647781], dtype=float32),\n",
" '1st-4th': array([-0.3199786 , 0.10094872, -0.10035568, 0.10014401, -0.09340642,\n",
" -0.00761677, 0.50759906, 0.288856 , -0.18745485, 0.05442255,\n",
" 0.6481828 , 0.18515776, 0.21597311, -0.21534163, 0.01798662,\n",
" -0.22816893], dtype=float32),\n",
" 'Preschool': array([ 0.10035816, -0.24015287, 0.00935481, 0.05356123, -0.18744251,\n",
" -0.39735606, 0.03849271, -0.2864288 , -0.10379744, 0.20251973,\n",
" 0.14565234, -0.24607188, -0.14268415, 0.1209868 , 0.04662501,\n",
" 0.41015574], dtype=float32)}"
"{'11th': array([-0.04807916, 0.21404432, 0.12517522, -0.154123 , 0.06864536,\n",
" 0.00092955, -0.38516527, -0.18440197, 0.15861034, 0.12012056,\n",
" 0.55413646, -0.16920644, 0.1356924 , -0.37921003, 0.53833497,\n",
" 0.08743049], dtype=float32),\n",
" 'HS-grad': array([ 0.37504154, 0.34191516, 0.27299362, 0.22921972, 0.07420117,\n",
" 0.34922913, 0.19239122, -0.42343035, -0.845824 , -0.07287297,\n",
" 0.27455565, 0.19505064, 0.07062761, -0.5201107 , 0.37823108,\n",
" 0.46134958], dtype=float32),\n",
" 'Assoc-acdm': array([ 0.22331461, 0.15005238, 0.13472553, -0.16886246, -0.12053325,\n",
" -0.04233408, -0.08905135, -0.54481906, 0.24300168, -0.21069968,\n",
" -0.00685616, -0.38423738, -0.00281451, 0.10599079, -0.05224385,\n",
" 0.2891064 ], dtype=float32),\n",
" 'Some-college': array([-0.09498356, -0.16801773, -0.09181987, 0.05381393, -0.03607363,\n",
" -0.05759075, 0.09382061, 0.33274302, -0.11906563, 0.14481838,\n",
" -0.1765725 , 0.20070277, 0.2960993 , -0.02055654, 0.26645136,\n",
" 0.4075843 ], dtype=float32),\n",
" '10th': array([-0.12961714, -0.27546212, 0.24345328, -0.24318363, 0.31552687,\n",
" 0.16653115, -0.05234893, 0.06825106, 0.2388588 , 0.10887478,\n",
" -0.12004007, -0.00373614, -0.0223387 , 0.133562 , 0.29672143,\n",
" 0.03046475], dtype=float32),\n",
" 'Prof-school': array([-0.1589678 , -0.07629952, 0.00763621, 0.13788143, 0.4114019 ,\n",
" 0.07717889, -0.17072953, 0.29419565, -0.18929462, -0.09182461,\n",
" -0.08409152, 0.01395322, -0.20351669, 0.18333136, -0.03983613,\n",
" -0.31888708], dtype=float32),\n",
" '7th-8th': array([ 0.39654806, 0.26095334, -0.3147828 , -0.41267306, -0.23983437,\n",
" -0.08034727, 0.4807234 , 0.3054779 , -0.3085564 , -0.07860225,\n",
" -0.1279486 , -0.2846014 , 0.1358583 , 0.24006395, -0.18911272,\n",
" -0.2299538 ], dtype=float32),\n",
" 'Bachelors': array([ 0.35242578, -0.03246311, 0.15835243, -0.06434399, 0.03403192,\n",
" 0.0088449 , 0.00627425, -0.31485453, -0.30984947, -0.23008366,\n",
" -0.09467663, 0.17246258, -0.09432375, 0.07691337, 0.70925283,\n",
" 0.18795769], dtype=float32),\n",
" 'Masters': array([-0.14503758, 0.0048258 , 0.58242404, 0.28511924, -0.13773848,\n",
" 0.35109136, 0.05824559, 0.3609631 , 0.4700086 , 0.4251728 ,\n",
" -0.2538366 , -0.00297809, 0.1424264 , -0.12481072, -0.09403807,\n",
" 0.00634856], dtype=float32),\n",
" 'Doctorate': array([-0.12487873, -0.1699961 , 0.2220065 , -0.04808738, 0.09443628,\n",
" -0.21019349, -0.23745097, 0.28523713, 0.05516997, -0.04004707,\n",
" 0.3316393 , 0.18710822, 0.4153885 , -0.12905155, 0.03055826,\n",
" 0.0664137 ], dtype=float32),\n",
" '5th-6th': array([ 0.21891987, -0.13600409, -0.03123563, 0.16288632, -0.03479639,\n",
" -0.4221951 , 0.4688111 , 0.08145971, -0.29254073, 0.18396533,\n",
" -0.20204993, -0.03327556, -0.2558647 , 0.56448 , -0.30299884,\n",
" 0.07629355], dtype=float32),\n",
" 'Assoc-voc': array([-0.01987046, -0.06434393, 0.00226 , 0.08150155, -0.33775425,\n",
" -0.13507745, 0.12741297, 0.0542295 , 0.09895965, 0.067229 ,\n",
" -0.1718493 , 0.01054914, 0.10441845, -0.18814586, -0.01663602,\n",
" 0.03088147], dtype=float32),\n",
" '9th': array([-0.24095939, 0.2750888 , 0.01418325, -0.36754113, 0.5431856 ,\n",
" -0.19582956, 0.03485603, 0.22838333, -0.05723334, 0.10631263,\n",
" 0.06331363, -0.09572615, 0.21977316, -0.02579625, -0.13822857,\n",
" 0.28736743], dtype=float32),\n",
" '12th': array([-0.20278502, -0.19245535, -0.04846343, 0.14459866, 0.25858438,\n",
" 0.15333128, 0.5074635 , -0.15141617, -0.19331448, -0.2630267 ,\n",
" -0.1378872 , -0.16868882, 0.4048257 , -0.34108582, -0.23098588,\n",
" 0.2859633 ], dtype=float32),\n",
" '1st-4th': array([-0.53678703, 0.19669479, -0.18026853, 0.33791658, 0.14260627,\n",
" 0.20269199, 0.00518189, 0.01120056, 0.01568659, 0.28752655,\n",
" 0.3359768 , 0.01758064, 0.11630564, -0.35470524, -0.05704446,\n",
" 0.41216984], dtype=float32),\n",
" 'Preschool': array([ 0.10326536, -0.02895411, 0.11348445, 0.03685748, 0.55893034,\n",
" -0.2522173 , -0.07186767, -0.30955225, -0.17825711, 0.02907414,\n",
" -0.61121726, 0.40596214, 0.63471395, 0.3304132 , 0.05272925,\n",
" -0.4266447 ], dtype=float32)}"
]
},
"execution_count": 20,
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
......
......@@ -1022,7 +1022,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 10,
"metadata": {},
"outputs": [
{
......@@ -1038,13 +1038,13 @@
}
],
"source": [
"text_preprocessor = TextPreprocessor(word_vectors_path=word_vectors_path)\n",
"X_text = text_preprocessor.fit_transform(df, text_col)"
"text_preprocessor = TextPreprocessor(word_vectors_path=word_vectors_path, text_col=text_col)\n",
"X_text = text_preprocessor.fit_transform(df)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 11,
"metadata": {},
"outputs": [
{
......@@ -1058,7 +1058,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
" 4%|▍ | 43/1001 [00:00<00:02, 428.36it/s]"
" 9%|▊ | 87/1001 [00:00<00:02, 428.89it/s]"
]
},
{
......@@ -1072,7 +1072,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1001/1001 [00:02<00:00, 425.98it/s]\n"
"100%|██████████| 1001/1001 [00:02<00:00, 426.92it/s]\n"
]
},
{
......@@ -1084,8 +1084,8 @@
}
],
"source": [
"image_processor = ImagePreprocessor()\n",
"X_images = image_processor.fit_transform(df, img_col, img_path)"
"image_processor = ImagePreprocessor(img_col = img_col, img_path = img_path)\n",
"X_images = image_processor.fit_transform(df)"
]
},
{
......@@ -1097,7 +1097,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
......@@ -1125,7 +1125,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
......@@ -1141,7 +1141,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
......@@ -1150,7 +1150,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 15,
"metadata": {},
"outputs": [
{
......@@ -1172,8 +1172,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"epoch 1: 100%|██████████| 25/25 [02:04<00:00, 4.97s/it, loss=117]\n",
"valid: 100%|██████████| 7/7 [00:14<00:00, 2.00s/it, loss=122]\n"
"epoch 1: 100%|██████████| 25/25 [02:06<00:00, 5.05s/it, loss=118]\n",
"valid: 100%|██████████| 7/7 [00:14<00:00, 2.01s/it, loss=226]\n"
]
}
],
......@@ -1193,7 +1193,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
......@@ -1217,7 +1217,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
......@@ -1233,7 +1233,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 18,
"metadata": {},
"outputs": [
{
......@@ -1386,7 +1386,7 @@
")"
]
},
"execution_count": 17,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
......@@ -1406,7 +1406,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
......@@ -1420,7 +1420,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
......@@ -1433,7 +1433,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
......@@ -1446,7 +1446,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
......@@ -1472,15 +1472,15 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/javier/pytorch-widedeep/pytorch_widedeep/initializers.py:32: UserWarning: No initializer found for deephead\n",
" if self.verbose: warnings.warn(\"No initializer found for {}\".format(name))\n"
"/Users/javier/pytorch-widedeep/pytorch_widedeep/initializers.py:31: UserWarning: No initializer found for deephead\n",
" warnings.warn(\"No initializer found for {}\".format(name))\n"
]
}
],
......@@ -1491,7 +1491,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 24,
"metadata": {},
"outputs": [
{
......@@ -1644,7 +1644,7 @@
")"
]
},
"execution_count": 23,
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
......@@ -1655,7 +1655,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 25,
"metadata": {},
"outputs": [
{
......@@ -1677,8 +1677,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"epoch 1: 100%|██████████| 25/25 [02:08<00:00, 5.15s/it, loss=128]\n",
"valid: 100%|██████████| 7/7 [00:14<00:00, 2.05s/it, loss=95.5]\n"
"epoch 1: 100%|██████████| 25/25 [02:04<00:00, 4.97s/it, loss=127]\n",
"valid: 100%|██████████| 7/7 [00:14<00:00, 2.02s/it, loss=94] \n"
]
}
],
......@@ -1696,7 +1696,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 26,
"metadata": {},
"outputs": [
{
......@@ -1721,7 +1721,7 @@
" 'lr_deephead_0': [0.001, 0.001]}"
]
},
"execution_count": 25,
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
......
......@@ -124,7 +124,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"outputs": [
{
......@@ -146,11 +146,11 @@
"name": "stderr",
"output_type": "stream",
"text": [
"epoch 1: 100%|██████████| 153/153 [00:01<00:00, 133.55it/s, loss=0.469, metrics={'acc': 0.7963}]\n",
"epoch 2: 100%|██████████| 153/153 [00:00<00:00, 156.42it/s, loss=0.372, metrics={'acc': 0.8119}]\n",
"epoch 3: 100%|██████████| 153/153 [00:01<00:00, 152.65it/s, loss=0.363, metrics={'acc': 0.8183}]\n",
"epoch 4: 100%|██████████| 153/153 [00:01<00:00, 144.01it/s, loss=0.36, metrics={'acc': 0.8218}] \n",
"epoch 5: 100%|██████████| 153/153 [00:00<00:00, 155.56it/s, loss=0.359, metrics={'acc': 0.8241}]\n",
"epoch 1: 100%|██████████| 153/153 [00:01<00:00, 131.88it/s, loss=0.471, metrics={'acc': 0.7946}]\n",
"epoch 2: 100%|██████████| 153/153 [00:00<00:00, 154.81it/s, loss=0.373, metrics={'acc': 0.8115}]\n",
"epoch 3: 100%|██████████| 153/153 [00:01<00:00, 151.56it/s, loss=0.364, metrics={'acc': 0.8182}]\n",
"epoch 4: 100%|██████████| 153/153 [00:00<00:00, 154.22it/s, loss=0.362, metrics={'acc': 0.8216}]\n",
"epoch 5: 100%|██████████| 153/153 [00:01<00:00, 152.65it/s, loss=0.36, metrics={'acc': 0.8238}] \n",
" 0%| | 0/153 [00:00<?, ?it/s]"
]
},
......@@ -165,11 +165,11 @@
"name": "stderr",
"output_type": "stream",
"text": [
"epoch 1: 100%|██████████| 153/153 [00:02<00:00, 64.03it/s, loss=0.4, metrics={'acc': 0.8214}] \n",
"epoch 2: 100%|██████████| 153/153 [00:02<00:00, 64.30it/s, loss=0.347, metrics={'acc': 0.8237}]\n",
"epoch 3: 100%|██████████| 153/153 [00:02<00:00, 64.18it/s, loss=0.341, metrics={'acc': 0.8258}]\n",
"epoch 4: 100%|██████████| 153/153 [00:02<00:00, 64.98it/s, loss=0.337, metrics={'acc': 0.8277}]\n",
"epoch 5: 100%|██████████| 153/153 [00:02<00:00, 65.70it/s, loss=0.333, metrics={'acc': 0.8294}]\n",
"epoch 1: 100%|██████████| 153/153 [00:02<00:00, 64.04it/s, loss=0.395, metrics={'acc': 0.8222}]\n",
"epoch 2: 100%|██████████| 153/153 [00:02<00:00, 65.34it/s, loss=0.349, metrics={'acc': 0.8242}]\n",
"epoch 3: 100%|██████████| 153/153 [00:02<00:00, 65.05it/s, loss=0.343, metrics={'acc': 0.8262}]\n",
"epoch 4: 100%|██████████| 153/153 [00:02<00:00, 64.93it/s, loss=0.339, metrics={'acc': 0.8279}]\n",
"epoch 5: 100%|██████████| 153/153 [00:02<00:00, 65.15it/s, loss=0.335, metrics={'acc': 0.8295}]\n",
" 0%| | 0/153 [00:00<?, ?it/s]"
]
},
......@@ -184,16 +184,16 @@
"name": "stderr",
"output_type": "stream",
"text": [
"epoch 1: 100%|██████████| 153/153 [00:02<00:00, 58.97it/s, loss=0.343, metrics={'acc': 0.8442}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 123.14it/s, loss=0.349, metrics={'acc': 0.8436}]\n",
"epoch 2: 100%|██████████| 153/153 [00:02<00:00, 59.48it/s, loss=0.333, metrics={'acc': 0.8457}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 114.05it/s, loss=0.346, metrics={'acc': 0.8447}]\n",
"epoch 3: 100%|██████████| 153/153 [00:02<00:00, 59.82it/s, loss=0.331, metrics={'acc': 0.8471}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 122.31it/s, loss=0.345, metrics={'acc': 0.8457}]\n",
"epoch 4: 100%|██████████| 153/153 [00:02<00:00, 59.20it/s, loss=0.329, metrics={'acc': 0.8474}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 113.66it/s, loss=0.345, metrics={'acc': 0.8459}]\n",
"epoch 5: 100%|██████████| 153/153 [00:02<00:00, 59.15it/s, loss=0.328, metrics={'acc': 0.8479}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 118.58it/s, loss=0.345, metrics={'acc': 0.8462}]\n"
"epoch 1: 100%|██████████| 153/153 [00:02<00:00, 58.31it/s, loss=0.345, metrics={'acc': 0.8415}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 120.77it/s, loss=0.346, metrics={'acc': 0.8416}]\n",
"epoch 2: 100%|██████████| 153/153 [00:02<00:00, 58.33it/s, loss=0.335, metrics={'acc': 0.8446}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 117.88it/s, loss=0.344, metrics={'acc': 0.8438}]\n",
"epoch 3: 100%|██████████| 153/153 [00:02<00:00, 58.43it/s, loss=0.331, metrics={'acc': 0.8457}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 109.26it/s, loss=0.343, metrics={'acc': 0.8449}]\n",
"epoch 4: 100%|██████████| 153/153 [00:02<00:00, 58.08it/s, loss=0.329, metrics={'acc': 0.8457}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 120.23it/s, loss=0.344, metrics={'acc': 0.8446}]\n",
"epoch 5: 100%|██████████| 153/153 [00:02<00:00, 58.75it/s, loss=0.327, metrics={'acc': 0.8464}]\n",
"valid: 100%|██████████| 39/39 [00:00<00:00, 119.22it/s, loss=0.344, metrics={'acc': 0.8453}]\n"
]
}
],
......@@ -233,7 +233,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
......@@ -243,7 +243,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
......@@ -270,7 +270,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 11,
"metadata": {},
"outputs": [
{
......@@ -289,7 +289,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
" 8%|▊ | 84/1001 [00:00<00:02, 413.82it/s]"
" 3%|▎ | 29/1001 [00:00<00:03, 282.66it/s]"
]
},
{
......@@ -303,7 +303,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1001/1001 [00:02<00:00, 419.78it/s]\n"
"100%|██████████| 1001/1001 [00:03<00:00, 327.44it/s]\n"
]
},
{
......@@ -323,16 +323,16 @@
"prepare_deep = DeepPreprocessor(embed_cols=cat_embed_cols, continuous_cols=continuous_cols)\n",
"X_deep = prepare_deep.fit_transform(df)\n",
"\n",
"text_processor = TextPreprocessor(word_vectors_path=word_vectors_path)\n",
"X_text = text_processor.fit_transform(df, text_col)\n",
"text_processor = TextPreprocessor(word_vectors_path=word_vectors_path, text_col=text_col)\n",
"X_text = text_processor.fit_transform(df)\n",
"\n",
"image_processor = ImagePreprocessor()\n",
"X_images = image_processor.fit_transform(df, img_col, img_path)"
"image_processor = ImagePreprocessor(img_col=img_col, img_path=img_path)\n",
"X_images = image_processor.fit_transform(df)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
......@@ -551,7 +551,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 13,
"metadata": {},
"outputs": [
{
......@@ -632,7 +632,7 @@
" ), Linear(in_features=512, out_features=1, bias=True)]"
]
},
"execution_count": 9,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
......@@ -652,7 +652,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
......@@ -661,7 +661,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 15,
"metadata": {},
"outputs": [
{
......@@ -683,7 +683,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"epoch 1: 100%|██████████| 25/25 [00:00<00:00, 42.89it/s, loss=127]\n",
"epoch 1: 100%|██████████| 25/25 [00:00<00:00, 57.98it/s, loss=127]\n",
" 0%| | 0/25 [00:00<?, ?it/s]"
]
},
......@@ -698,7 +698,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"epoch 1: 100%|██████████| 25/25 [00:00<00:00, 43.28it/s, loss=117]\n",
"epoch 1: 100%|██████████| 25/25 [00:00<00:00, 45.81it/s, loss=116]\n",
" 0%| | 0/25 [00:00<?, ?it/s]"
]
},
......@@ -713,7 +713,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"epoch 1: 100%|██████████| 25/25 [00:05<00:00, 4.78it/s, loss=132]\n",
"epoch 1: 100%|██████████| 25/25 [00:04<00:00, 5.37it/s, loss=132]\n",
" 0%| | 0/25 [00:00<?, ?it/s]"
]
},
......@@ -728,7 +728,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"epoch 1: 100%|██████████| 25/25 [01:11<00:00, 2.86s/it, loss=119]\n",
"epoch 1: 100%|██████████| 25/25 [01:10<00:00, 2.83s/it, loss=119]\n",
" 0%| | 0/25 [00:00<?, ?it/s]"
]
},
......@@ -743,7 +743,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"epoch 1: 100%|██████████| 25/25 [01:35<00:00, 3.81s/it, loss=108]\n",
"epoch 1: 100%|██████████| 25/25 [01:34<00:00, 3.76s/it, loss=108]\n",
" 0%| | 0/25 [00:00<?, ?it/s]"
]
},
......@@ -758,7 +758,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"epoch 1: 100%|██████████| 25/25 [01:58<00:00, 4.76s/it, loss=105]\n",
"epoch 1: 100%|██████████| 25/25 [01:57<00:00, 4.69s/it, loss=106]\n",
" 0%| | 0/25 [00:00<?, ?it/s]"
]
},
......@@ -773,7 +773,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"epoch 1: 100%|██████████| 25/25 [02:25<00:00, 5.80s/it, loss=105]\n",
"epoch 1: 100%|██████████| 25/25 [02:24<00:00, 5.79s/it, loss=105] \n",
" 0%| | 0/25 [00:00<?, ?it/s]"
]
},
......@@ -788,7 +788,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"epoch 1: 100%|██████████| 25/25 [03:04<00:00, 7.38s/it, loss=106] \n",
"epoch 1: 100%|██████████| 25/25 [03:01<00:00, 7.26s/it, loss=105] \n",
" 0%| | 0/25 [00:00<?, ?it/s]"
]
},
......@@ -803,8 +803,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"epoch 1: 100%|██████████| 25/25 [02:05<00:00, 5.00s/it, loss=130]\n",
"valid: 100%|██████████| 7/7 [00:14<00:00, 2.04s/it, loss=125] \n"
"epoch 1: 100%|██████████| 25/25 [02:05<00:00, 5.03s/it, loss=129]\n",
"valid: 100%|██████████| 7/7 [00:14<00:00, 2.11s/it, loss=103] \n"
]
}
],
......
......@@ -5,42 +5,57 @@ import torch
from pytorch_widedeep.preprocessing import WidePreprocessor, DeepPreprocessor
from pytorch_widedeep.models import Wide, DeepDense, WideDeep
from pytorch_widedeep.optim import RAdam
from pytorch_widedeep.initializers import *
from pytorch_widedeep.callbacks import *
from pytorch_widedeep.metrics import *
from pytorch_widedeep.initializers import KaimingNormal, XavierNormal
from pytorch_widedeep.callbacks import LRHistory, EarlyStopping, ModelCheckpoint
from pytorch_widedeep.metrics import BinaryAccuracy
use_cuda = torch.cuda.is_available()
if __name__ == '__main__':
if __name__ == "__main__":
df = pd.read_csv('data/adult/adult.csv.zip')
df = pd.read_csv("data/adult/adult.csv.zip")
df.columns = [c.replace("-", "_") for c in df.columns]
df['age_buckets'] = pd.cut(df.age, bins=[16, 25, 30, 35, 40, 45, 50, 55, 60, 91], labels=np.arange(9))
df['income_label'] = (df["income"].apply(lambda x: ">50K" in x)).astype(int)
df.drop('income', axis=1, inplace=True)
df["age_buckets"] = pd.cut(
df.age, bins=[16, 25, 30, 35, 40, 45, 50, 55, 60, 91], labels=np.arange(9)
)
df["income_label"] = (df["income"].apply(lambda x: ">50K" in x)).astype(int)
df.drop("income", axis=1, inplace=True)
df.head()
wide_cols = ['age_buckets', 'education', 'relationship','workclass','occupation',
'native_country','gender']
crossed_cols = [('education', 'occupation'), ('native_country', 'occupation')]
cat_embed_cols = [('education',10), ('relationship',8), ('workclass',10),
('occupation',10),('native_country',10)]
continuous_cols = ["age","hours_per_week"]
target = 'income_label'
wide_cols = [
"age_buckets",
"education",
"relationship",
"workclass",
"occupation",
"native_country",
"gender",
]
crossed_cols = [("education", "occupation"), ("native_country", "occupation")]
cat_embed_cols = [
("education", 10),
("relationship", 8),
("workclass", 10),
("occupation", 10),
("native_country", 10),
]
continuous_cols = ["age", "hours_per_week"]
target = "income_label"
target = df[target].values
prepare_wide = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
X_wide = prepare_wide.fit_transform(df)
prepare_deep = DeepPreprocessor(embed_cols=cat_embed_cols, continuous_cols=continuous_cols)
prepare_deep = DeepPreprocessor(
embed_cols=cat_embed_cols, continuous_cols=continuous_cols
)
X_deep = prepare_deep.fit_transform(df)
wide = Wide(
wide_dim=X_wide.shape[1],
output_dim=1)
wide = Wide(wide_dim=X_wide.shape[1], output_dim=1)
deepdense = DeepDense(
hidden_layers=[64,32],
dropout=[0.2,0.2],
hidden_layers=[64, 32],
dropout=[0.2, 0.2],
deep_column_idx=prepare_deep.deep_column_idx,
embed_input=prepare_deep.embeddings_input,
continuous_cols=continuous_cols)
continuous_cols=continuous_cols,
)
model = WideDeep(wide=wide, deepdense=deepdense)
wide_opt = torch.optim.Adam(model.wide.parameters())
......@@ -48,19 +63,24 @@ if __name__ == '__main__':
wide_sch = torch.optim.lr_scheduler.StepLR(wide_opt, step_size=3)
deep_sch = torch.optim.lr_scheduler.StepLR(deep_opt, step_size=5)
optimizers = {'wide': wide_opt, 'deepdense':deep_opt}
schedulers = {'wide': wide_sch, 'deepdense':deep_sch}
initializers = {'wide': KaimingNormal, 'deepdense':XavierNormal}
callbacks = [LRHistory(n_epochs=10), EarlyStopping, ModelCheckpoint(filepath='model_weights/wd_out')]
optimizers = {"wide": wide_opt, "deepdense": deep_opt}
schedulers = {"wide": wide_sch, "deepdense": deep_sch}
initializers = {"wide": KaimingNormal, "deepdense": XavierNormal}
callbacks = [
LRHistory(n_epochs=10),
EarlyStopping,
ModelCheckpoint(filepath="model_weights/wd_out"),
]
metrics = [BinaryAccuracy]
model.compile(
method='binary',
method="binary",
optimizers=optimizers,
lr_schedulers=schedulers,
initializers=initializers,
callbacks=callbacks,
metrics=metrics)
metrics=metrics,
)
model.fit(
X_wide=X_wide,
......@@ -68,4 +88,5 @@ if __name__ == '__main__':
target=target,
n_epochs=10,
batch_size=256,
val_split=0.2)
val_split=0.2,
)
......@@ -3,32 +3,59 @@ import numpy as np
import pandas as pd
import warnings
import os
import gender_guesser.detector as gender
from sklearn.preprocessing import MultiLabelBinarizer
from functools import reduce
from itertools import chain
from collections import Counter
from pathlib import Path
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore")
DATA_PATH = Path('data/airbnb')
fname = 'listings.csv.gz'
DATA_PATH = Path("data/airbnb")
fname = "listings.csv.gz"
if not os.path.exists(DATA_PATH):
os.makedirs(DATA_PATH)
df_original = pd.read_csv(DATA_PATH/fname)
df_original = pd.read_csv(DATA_PATH / fname)
print(df_original.shape)
df_original.head()
# this choice is subjective. One could keep a different set of columns
keep_cols = ['id', 'host_id', 'description', 'house_rules', 'host_name',
'host_listings_count', 'host_identity_verified', 'neighbourhood_cleansed',
'latitude', 'longitude','is_location_exact','property_type', 'room_type',
'accommodates', 'bathrooms', 'bedrooms','beds', 'amenities', 'price',
'security_deposit', 'cleaning_fee','guests_included', 'extra_people',
'minimum_nights','instant_bookable','cancellation_policy',
'reviews_per_month']
keep_cols = [
"id",
"host_id",
"description",
"house_rules",
"host_name",
"host_listings_count",
"host_identity_verified",
"neighbourhood_cleansed",
"latitude",
"longitude",
"is_location_exact",
"property_type",
"room_type",
"accommodates",
"bathrooms",
"bedrooms",
"beds",
"amenities",
"price",
"security_deposit",
"cleaning_fee",
"guests_included",
"extra_people",
"minimum_nights",
"instant_bookable",
"cancellation_policy",
"reviews_per_month",
]
df = df_original[keep_cols]
df = df[~df.reviews_per_month.isna()]
df = df[~df.description.isna() ]
df = df[~df.description.isna()]
df = df[~df.host_listings_count.isna()]
print(df.shape)
......@@ -39,25 +66,23 @@ print(df.shape)
#
# I will simply include a binary column with 1/0 if the property has/has not
# house rules.
df['has_house_rules'] = df['house_rules']
df["has_house_rules"] = df["house_rules"]
df.has_house_rules.fillna(0, inplace=True)
df['has_house_rules'][df.has_house_rules!=0] = 1
df.drop('house_rules', axis=1, inplace=True)
df["has_house_rules"][df.has_house_rules != 0] = 1
df.drop("house_rules", axis=1, inplace=True)
# host_name
#
# I will use names to infer gender using `gender_guesser`
import gender_guesser.detector as gender
from collections import Counter
host_name = df.host_name.tolist()
d = gender.Detector()
host_gender = [d.get_gender(n) for n in host_name]
replace_dict = {'mostly_male': 'male', 'mostly_female': 'female', 'andy': 'unknown'}
host_gender = [replace_dict.get(item,item) for item in host_gender]
replace_dict = {"mostly_male": "male", "mostly_female": "female", "andy": "unknown"}
host_gender = [replace_dict.get(item, item) for item in host_gender]
Counter(host_gender)
df['host_gender'] = host_gender
df.drop('host_name', axis=1, inplace=True)
df["host_gender"] = host_gender
df.drop("host_name", axis=1, inplace=True)
df.head()
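# Hedged note on the mapping above: gender_guesser's Detector().get_gender(name)
# returns one of "male", "female", "mostly_male", "mostly_female", "andy"
# (androgynous) or "unknown", so replace_dict folds the six categories down to
# the three labels ("male", "female", "unknown") kept as host_gender.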
# property_type, room_type, accommodates, bathrooms, bedrooms, beds and
......@@ -65,45 +90,58 @@ df.head()
#
# Here, some standard pre-processing
df.property_type.value_counts()
replace_prop_type = [val for val in df.property_type.unique().tolist() if val not in ['Apartment', 'House']]
replace_prop_type = {k:'other' for k in replace_prop_type}
replace_prop_type = [
val
for val in df.property_type.unique().tolist()
if val not in ["Apartment", "House"]
]
replace_prop_type = {k: "other" for k in replace_prop_type}
df.property_type.replace(replace_prop_type, inplace=True)
df['property_type'] = df.property_type.apply(lambda x: '_'.join(x.split(' ')).lower())
df["property_type"] = df.property_type.apply(lambda x: "_".join(x.split(" ")).lower())
df.room_type.value_counts()
df['room_type'] = df.room_type.apply(lambda x: '_'.join(x.split(' ')).lower())
df["room_type"] = df.room_type.apply(lambda x: "_".join(x.split(" ")).lower())
df['bathrooms'][(df.bathrooms.isna()) & (df.room_type == 'private_room')] = 0
df['bathrooms'][(df.bathrooms.isna()) & (df.room_type == 'entire_home/apt')] = 1
df["bathrooms"][(df.bathrooms.isna()) & (df.room_type == "private_room")] = 0
df["bathrooms"][(df.bathrooms.isna()) & (df.room_type == "entire_home/apt")] = 1
df.bedrooms.fillna(1, inplace=True)
df.beds.fillna(1, inplace=True)
# Encode some as categorical
categorical_cut = [('accommodates', 3),
('guests_included', 3),
('minimum_nights', 3),
('host_listings_count', 3),
('bathrooms', 1.5),
('bedrooms', 3),
('beds', 3)
]
categorical_cut = [
("accommodates", 3),
("guests_included", 3),
("minimum_nights", 3),
("host_listings_count", 3),
("bathrooms", 1.5),
("bedrooms", 3),
("beds", 3),
]
for col, cut in categorical_cut:
new_colname = col + '_catg'
df[new_colname] = df[col].apply(lambda x: cut if x >=cut else x)
new_colname = col + "_catg"
df[new_colname] = df[col].apply(lambda x: cut if x >= cut else x)
df[new_colname] = df[new_colname].round().astype(int)
# Amenities
#
# I will just add a number of dummy columns with 1/0 if the property has/has
# not that particular amenity
from functools import reduce
from itertools import chain
amenity_repls = (("\"", ""), ("{", ""), ("}", ""),(" / ", "_"), ("/", "_"), (" ", "_"), ("(s)", ""))
amenity_repls = (
('"', ""),
("{", ""),
("}", ""),
(" / ", "_"),
("/", "_"),
(" ", "_"),
("(s)", ""),
)
amenities_raw = df.amenities.str.lower().tolist()
amenities = [reduce(lambda a, kv: a.replace(*kv), amenity_repls, s).split(",") for s in amenities_raw]
amenities = [
reduce(lambda a, kv: a.replace(*kv), amenity_repls, s).split(",")
for s in amenities_raw
]
all_amenities = list(chain(*amenities))
all_amenities_count = Counter(all_amenities)
......@@ -112,71 +150,81 @@ all_amenities_count
# having a look at the list we see that one amenity is empty and two are
# "translation missing:..."
keep_amenities = []
for k,v in all_amenities_count.items():
if k and 'missing' not in k:
for k, v in all_amenities_count.items():
if k and "missing" not in k:
keep_amenities.append(k)
final_amenities = [[amenity for amenity in house_amenities if amenity in keep_amenities]
for house_amenities in amenities]
final_amenities = [
[amenity for amenity in house_amenities if amenity in keep_amenities]
for house_amenities in amenities
]
# some properties apparently have no amenities
final_amenities = [['no amenities'] if not amenity else amenity for amenity in final_amenities]
final_amenities = [['amenity_'+ amenity for amenity in amenities] for amenities in final_amenities]
final_amenities = [
["no amenities"] if not amenity else amenity for amenity in final_amenities
]
final_amenities = [
["amenity_" + amenity for amenity in amenities] for amenities in final_amenities
]
# let's build the dummy df
from sklearn.preprocessing import MultiLabelBinarizer
df_list_of_amenities = pd.DataFrame(
{'groups': final_amenities
}, columns=['groups'])
s = df_list_of_amenities['groups']
df_list_of_amenities = pd.DataFrame({"groups": final_amenities}, columns=["groups"])
s = df_list_of_amenities["groups"]
mlb = MultiLabelBinarizer()
df_amenities = pd.DataFrame(mlb.fit_transform(s), columns=mlb.classes_, index=df.index)
df.drop('amenities', axis=1, inplace=True)
df.drop("amenities", axis=1, inplace=True)
df = pd.concat([df, df_amenities], axis=1)
df.head()
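# Toy sketch of what MultiLabelBinarizer does (illustrative input, not the real
# amenities): fit_transform([["tv", "wifi"], ["wifi"]]) returns [[1, 1], [0, 1]]
# with classes_ == ["tv", "wifi"], i.e. one 1/0 column per distinct amenity,
# which is exactly the dummy layout concatenated into df above.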
# Price, security_deposit, cleaning_fee, extra_people
money_columns = ['price', 'security_deposit', 'cleaning_fee', 'extra_people']
tmp_money_df = df[money_columns].fillna('$0')
money_columns = ["price", "security_deposit", "cleaning_fee", "extra_people"]
tmp_money_df = df[money_columns].fillna("$0")
money_repls = (("$", ""), (",", ""))
for col in money_columns:
val_str = tmp_money_df[col].tolist()
val_num = [float(st) for st in [reduce(lambda a, kv: a.replace(*kv), money_repls, s) for s in val_str]]
val_num = [
float(st)
for st in [
reduce(lambda a, kv: a.replace(*kv), money_repls, s) for s in val_str
]
]
tmp_money_df[col] = val_num
high_price, high_deposit, high_cleaning_fee, high_extra_people = 1000,2000,200,100
high_price, high_deposit, high_cleaning_fee, high_extra_people = 1000, 2000, 200, 100
high_price_count = (tmp_money_df.price >= high_price).sum()
high_deposit_count = (tmp_money_df.security_deposit >= high_deposit).sum()
high_cleaning_fee_count = (tmp_money_df.cleaning_fee >= high_cleaning_fee).sum()
high_extra_people_count = (tmp_money_df.extra_people >= high_extra_people).sum()
print('properties with very high price: {}'.format(high_price_count))
print('properties with very high security deposit: {}'.format(high_deposit_count))
print('properties with very high cleaning fee: {}'.format(high_cleaning_fee_count))
print('properties with very high extra people cost: {}'.format(high_extra_people_count))
print("properties with very high price: {}".format(high_price_count))
print("properties with very high security deposit: {}".format(high_deposit_count))
print("properties with very high cleaning fee: {}".format(high_cleaning_fee_count))
print("properties with very high extra people cost: {}".format(high_extra_people_count))
# We will now just concat and drop the high values later on
df.drop(money_columns, axis=1, inplace=True)
df = pd.concat([df, tmp_money_df], axis=1)
df = df[(df.price<high_price) &
(df.price!=0) &
(df.security_deposit<high_deposit) &
(df.cleaning_fee<high_cleaning_fee) &
(df.extra_people<high_extra_people)]
df = df[
(df.price < high_price)
& (df.price != 0)
& (df.security_deposit < high_deposit)
& (df.cleaning_fee < high_cleaning_fee)
& (df.extra_people < high_extra_people)
]
df.head()
print(df.shape)
# let's make sure there are no NaNs left
has_nan = df.isnull().any(axis=0)
has_nan = [df.columns[i] for i in np.where(has_nan)[0]]
if not has_nan: print('no NaN, all OK')
if not has_nan:
print("no NaN, all OK")
# Computing a proxy for yield
......@@ -192,11 +240,10 @@ if not has_nan: print('no NaN, all OK')
# Francisco model assumptions simply multiply my yield by 6 (3 * (1/0.5)) or
# by 72 (3 * 2 * 12) if you prefer per year.
df['yield'] = (df['price']+df['cleaning_fee']) * (df['reviews_per_month'])
df.drop(['price','cleaning_fee','reviews_per_month'], axis=1, inplace=True)
df["yield"] = (df["price"] + df["cleaning_fee"]) * (df["reviews_per_month"])
df.drop(["price", "cleaning_fee", "reviews_per_month"], axis=1, inplace=True)
# we will focus on cases with yield below 600 (we lose ~3% of the data).
# No real reason for this, simply removing some "outliers"
df = df[df['yield'] <= 600]
df.to_csv(DATA_PATH/'listings_processed.csv', index=False)
df = df[df["yield"] <= 600]
df.to_csv(DATA_PATH / "listings_processed.csv", index=False)
print("data preprocessed finished. Final shape: {}".format(df.shape))
import numpy as np
import pandas as pd
import os
import torch
from pytorch_widedeep.preprocessing import (WidePreprocessor, DeepPreprocessor,
TextPreprocessor, ImagePreprocessor)
from pytorch_widedeep.models import (Wide, DeepDense, DeepText, DeepImage,
WideDeep)
from pytorch_widedeep.initializers import *
from pytorch_widedeep.callbacks import *
from torchvision.transforms import ToTensor, Normalize
from pytorch_widedeep.preprocessing import (
WidePreprocessor,
DeepPreprocessor,
TextPreprocessor,
ImagePreprocessor,
)
from pytorch_widedeep.models import Wide, DeepDense, DeepText, DeepImage, WideDeep
from pytorch_widedeep.initializers import KaimingNormal
from pytorch_widedeep.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_widedeep.optim import RAdam
use_cuda = torch.cuda.is_available()
if __name__ == '__main__':
df = pd.read_csv('data/airbnb/airbnb_sample.csv')
use_cuda = torch.cuda.is_available()
crossed_cols = (['property_type', 'room_type'],)
already_dummies = [c for c in df.columns if 'amenity' in c] + ['has_house_rules']
wide_cols = ['is_location_exact', 'property_type', 'room_type', 'host_gender',
'instant_bookable'] + already_dummies
cat_embed_cols = [(c, 16) for c in df.columns if 'catg' in c] + \
[('neighbourhood_cleansed', 64), ('cancellation_policy', 16)]
continuous_cols = ['latitude', 'longitude', 'security_deposit', 'extra_people']
already_standard = ['latitude', 'longitude']
text_col = 'description'
word_vectors_path = 'data/glove.6B/glove.6B.100d.txt'
img_col = 'id'
img_path = 'data/airbnb/property_picture'
target = 'yield'
if __name__ == "__main__":
df = pd.read_csv("data/airbnb/airbnb_sample.csv")
crossed_cols = (["property_type", "room_type"],)
already_dummies = [c for c in df.columns if "amenity" in c] + ["has_house_rules"]
wide_cols = [
"is_location_exact",
"property_type",
"room_type",
"host_gender",
"instant_bookable",
] + already_dummies
cat_embed_cols = [(c, 16) for c in df.columns if "catg" in c] + [
("neighbourhood_cleansed", 64),
("cancellation_policy", 16),
]
continuous_cols = ["latitude", "longitude", "security_deposit", "extra_people"]
already_standard = ["latitude", "longitude"]
text_col = "description"
word_vectors_path = "data/glove.6B/glove.6B.100d.txt"
img_col = "id"
img_path = "data/airbnb/property_picture"
target = "yield"
target = df[target].values
prepare_wide = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
X_wide = prepare_wide.fit_transform(df)
prepare_deep = DeepPreprocessor(embed_cols=cat_embed_cols, continuous_cols=continuous_cols)
prepare_deep = DeepPreprocessor(
embed_cols=cat_embed_cols, continuous_cols=continuous_cols
)
X_deep = prepare_deep.fit_transform(df)
text_processor = TextPreprocessor(word_vectors_path=word_vectors_path)
X_text = text_processor.fit_transform(df, text_col)
text_processor = TextPreprocessor(
word_vectors_path=word_vectors_path, text_col=text_col
)
X_text = text_processor.fit_transform(df)
image_processor = ImagePreprocessor()
X_images = image_processor.fit_transform(df, img_col, img_path)
image_processor = ImagePreprocessor(img_col=img_col, img_path=img_path)
X_images = image_processor.fit_transform(df)
wide = Wide(
wide_dim=X_wide.shape[1],
output_dim=1)
wide = Wide(wide_dim=X_wide.shape[1], output_dim=1)
deepdense = DeepDense(
hidden_layers=[64,32],
dropout=[0.2,0.2],
hidden_layers=[64, 32],
dropout=[0.2, 0.2],
deep_column_idx=prepare_deep.deep_column_idx,
embed_input=prepare_deep.embeddings_input,
continuous_cols=continuous_cols)
continuous_cols=continuous_cols,
)
deeptext = DeepText(
vocab_size=len(text_processor.vocab.itos),
hidden_dim=64,
n_layers=3,
rnn_dropout=0.5,
padding_idx=1,
embedding_matrix=text_processor.embedding_matrix
)
embedding_matrix=text_processor.embedding_matrix,
)
deepimage = DeepImage(pretrained=True, head_layers=None)
model = WideDeep(wide=wide, deepdense=deepdense, deeptext=deeptext,
deepimage=deepimage)
model = WideDeep(
wide=wide, deepdense=deepdense, deeptext=deeptext, deepimage=deepimage
)
wide_opt = torch.optim.Adam(model.wide.parameters())
deep_opt = torch.optim.Adam(model.deepdense.parameters())
text_opt = RAdam(model.deeptext.parameters())
img_opt = RAdam(model.deepimage.parameters())
img_opt = RAdam(model.deepimage.parameters())
wide_sch = torch.optim.lr_scheduler.StepLR(wide_opt, step_size=5)
deep_sch = torch.optim.lr_scheduler.StepLR(deep_opt, step_size=3)
text_sch = torch.optim.lr_scheduler.StepLR(text_opt, step_size=5)
img_sch = torch.optim.lr_scheduler.StepLR(img_opt, step_size=3)
optimizers = {'wide': wide_opt, 'deepdense':deep_opt, 'deeptext':text_opt, 'deepimage': img_opt}
schedulers = {'wide': wide_sch, 'deepdense':deep_sch, 'deeptext':text_sch, 'deepimage': img_sch}
initializers = {'wide': KaimingNormal, 'deepdense':KaimingNormal, 'deeptext':KaimingNormal, 'deepimage':KaimingNormal}
mean = [0.406, 0.456, 0.485] #BGR
std = [0.225, 0.224, 0.229] #BGR
img_sch = torch.optim.lr_scheduler.StepLR(img_opt, step_size=3)
optimizers = {
"wide": wide_opt,
"deepdense": deep_opt,
"deeptext": text_opt,
"deepimage": img_opt,
}
schedulers = {
"wide": wide_sch,
"deepdense": deep_sch,
"deeptext": text_sch,
"deepimage": img_sch,
}
initializers = {
"wide": KaimingNormal,
"deepdense": KaimingNormal,
"deeptext": KaimingNormal,
"deepimage": KaimingNormal,
}
mean = [0.406, 0.456, 0.485] # BGR
std = [0.225, 0.224, 0.229] # BGR
transforms = [ToTensor, Normalize(mean=mean, std=std)]
callbacks = [EarlyStopping, ModelCheckpoint(filepath='model_weights/wd_out.pt')]
model.compile(method='regression', initializers=initializers, optimizers=optimizers,
lr_schedulers=schedulers, callbacks=callbacks, transforms=transforms)
model.fit(X_wide=X_wide, X_deep=X_deep, X_text=X_text, X_img=X_images,
target=target, n_epochs=1, batch_size=32, val_split=0.2)
# With warm_up
callbacks = [EarlyStopping, ModelCheckpoint(filepath="model_weights/wd_out.pt")]
model.compile(
method="regression",
initializers=initializers,
optimizers=optimizers,
lr_schedulers=schedulers,
callbacks=callbacks,
transforms=transforms,
)
model.fit(
X_wide=X_wide,
X_deep=X_deep,
X_text=X_text,
X_img=X_images,
target=target,
n_epochs=1,
batch_size=32,
val_split=0.2,
)
# # With warm_up
# child = list(model.deepimage.children())[0]
# img_layers = list(child.backbone.children())[4:8] + [list(model.deepimage.children())[1]]
# img_layers = img_layers[::-1]
......@@ -98,4 +142,4 @@ if __name__ == '__main__':
# model.fit(X_wide=X_wide, X_deep=X_deep, X_text=X_text, X_img=X_images,
# target=target, n_epochs=1, batch_size=32, val_split=0.2, warm_up=True,
# warm_epochs=1, warm_deepimage_gradual=True, warm_deepimage_layers=img_layers,
# warm_deepimage_max_lr=0.01, warm_routine='howard')
\ No newline at end of file
# warm_deepimage_max_lr=0.01, warm_routine='howard')
......@@ -4,51 +4,58 @@ import torch
from pytorch_widedeep.preprocessing import WidePreprocessor, DeepPreprocessor
from pytorch_widedeep.models import Wide, DeepDense, WideDeep
from pytorch_widedeep.optim import RAdam
from pytorch_widedeep.initializers import *
from pytorch_widedeep.callbacks import *
from pytorch_widedeep.metrics import *
import pdb
from pytorch_widedeep.metrics import CategoricalAccuracy
use_cuda = torch.cuda.is_available()
if __name__ == '__main__':
if __name__ == "__main__":
df = pd.read_csv('data/airbnb/airbnb_sample.csv')
df = pd.read_csv("data/airbnb/airbnb_sample.csv")
crossed_cols = (['property_type', 'room_type'],)
already_dummies = [c for c in df.columns if 'amenity' in c] + ['has_house_rules']
wide_cols = ['is_location_exact', 'property_type', 'room_type', 'host_gender',
'instant_bookable'] + already_dummies
cat_embed_cols = [(c, 16) for c in df.columns if 'catg' in c] + \
[('neighbourhood_cleansed', 64), ('cancellation_policy', 16)]
continuous_cols = ['latitude', 'longitude', 'security_deposit', 'extra_people']
already_standard = ['latitude', 'longitude']
df['yield_cat'] = pd.cut(df['yield'], bins=[0.2, 65, 163, 600], labels=[0,1,2])
df.drop('yield', axis=1, inplace=True)
target = 'yield_cat'
crossed_cols = (["property_type", "room_type"],)
already_dummies = [c for c in df.columns if "amenity" in c] + ["has_house_rules"]
wide_cols = [
"is_location_exact",
"property_type",
"room_type",
"host_gender",
"instant_bookable",
] + already_dummies
cat_embed_cols = [(c, 16) for c in df.columns if "catg" in c] + [
("neighbourhood_cleansed", 64),
("cancellation_policy", 16),
]
continuous_cols = ["latitude", "longitude", "security_deposit", "extra_people"]
already_standard = ["latitude", "longitude"]
df["yield_cat"] = pd.cut(df["yield"], bins=[0.2, 65, 163, 600], labels=[0, 1, 2])
df.drop("yield", axis=1, inplace=True)
target = "yield_cat"
target = np.array(df[target].values)
prepare_wide = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
X_wide = prepare_wide.fit_transform(df)
prepare_deep = DeepPreprocessor(embed_cols=cat_embed_cols, continuous_cols=continuous_cols)
prepare_deep = DeepPreprocessor(
embed_cols=cat_embed_cols, continuous_cols=continuous_cols
)
X_deep = prepare_deep.fit_transform(df)
wide = Wide(
wide_dim=X_wide.shape[1],
output_dim=3)
wide = Wide(wide_dim=X_wide.shape[1], output_dim=3)
deepdense = DeepDense(
hidden_layers=[64,32],
dropout=[0.2,0.2],
hidden_layers=[64, 32],
dropout=[0.2, 0.2],
deep_column_idx=prepare_deep.deep_column_idx,
embed_input=prepare_deep.embeddings_input,
continuous_cols=continuous_cols)
continuous_cols=continuous_cols,
)
model = WideDeep(wide=wide, deepdense=deepdense, output_dim=3)
model.compile(method='multiclass', metrics=[CategoricalAccuracy])
model.compile(method="multiclass", metrics=[CategoricalAccuracy])
model.fit(
X_wide=X_wide,
X_deep=X_deep,
target=target,
n_epochs=1,
batch_size=32,
val_split=0.2)
\ No newline at end of file
val_split=0.2,
)
\ No newline at end of file
......@@ -7,42 +7,44 @@ from urllib.request import urlretrieve
from pathlib import Path
def download_images(df, out_path, id_col, img_col):
download_error = []
counter = 0
for idx,row in tqdm(df.iterrows(), total=df.shape[0]):
if counter <1000:
img_path = str(out_path/'.'.join([str(row[id_col]), 'jpg']))
if os.path.isfile(img_path):
continue
else:
try:
urlretrieve(row[img_col], img_path)
counter+=1
except:
# print("Error downloading host image {}".format(row[id_col]))
download_error.append(row[id_col])
pass
pickle.dump(download_error, open(DATA_PATH/(id_col+'_download_error.p'), 'wb'))
if __name__ == '__main__':
DATA_PATH=Path('data/airbnb')
HOST_PATH = DATA_PATH/'host_picture'
PROP_PATH = DATA_PATH/'property_picture'
if not os.path.exists(DATA_PATH):
os.makedirs(DATA_PATH)
if not os.path.exists(HOST_PATH):
os.makedirs(HOST_PATH)
if not os.path.exists(PROP_PATH):
os.makedirs(PROP_PATH)
df_original = pd.read_csv(DATA_PATH/'listings.csv.gz')[['id', 'host_id', 'picture_url', 'host_picture_url']]
df_processed = pd.read_csv(DATA_PATH/'listings_processed.csv')[['id', 'host_id']]
df = df_processed.merge(df_original, on=['id', 'host_id'])
df_host = df.groupby('host_id').first().reset_index()
download_images(df_host, HOST_PATH, id_col='host_id', img_col='host_picture_url')
download_images(df, PROP_PATH, id_col='id', img_col='picture_url')
def download_images(df, out_path, id_col, img_col):
download_error = []
counter = 0
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
if counter < 1000:
img_path = str(out_path / ".".join([str(row[id_col]), "jpg"]))
if os.path.isfile(img_path):
continue
else:
try:
urlretrieve(row[img_col], img_path)
counter += 1
except:
# print("Error downloading host image {}".format(row[id_col]))
download_error.append(row[id_col])
pass
pickle.dump(download_error, open(DATA_PATH / (id_col + "_download_error.p"), "wb"))
if __name__ == "__main__":
DATA_PATH = Path("data/airbnb")
HOST_PATH = DATA_PATH / "host_picture"
PROP_PATH = DATA_PATH / "property_picture"
if not os.path.exists(DATA_PATH):
os.makedirs(DATA_PATH)
if not os.path.exists(HOST_PATH):
os.makedirs(HOST_PATH)
if not os.path.exists(PROP_PATH):
os.makedirs(PROP_PATH)
df_original = pd.read_csv(DATA_PATH / "listings.csv.gz")[
["id", "host_id", "picture_url", "host_picture_url"]
]
df_processed = pd.read_csv(DATA_PATH / "listings_processed.csv")[["id", "host_id"]]
df = df_processed.merge(df_original, on=["id", "host_id"])
df_host = df.groupby("host_id").first().reset_index()
download_images(df_host, HOST_PATH, id_col="host_id", img_col="host_picture_url")
download_images(df, PROP_PATH, id_col="id", img_col="picture_url")
......@@ -11,4 +11,4 @@ from pytorch_widedeep.version import __version__
from .utils import dense_utils
from .utils import text_utils
from .utils import fastai_transforms
from .utils import image_utils
\ No newline at end of file
from .utils import image_utils
This diff has been collapsed.
import torch
import re
import warnings
......@@ -7,180 +6,181 @@ from .wdtypes import *
class Initializer(object):
def __call__(self, model:nn.Module):
raise NotImplementedError('Initializer must implement this method')
def __call__(self, model: nn.Module):
raise NotImplementedError("Initializer must implement this method")
class MultipleInitializer(object):
def __init__(self, initializers:Dict[str, Initializer], verbose=True):
self.verbose=verbose
instantiated_initializers = {}
for model_name, initializer in initializers.items():
if isinstance(initializer, type):
instantiated_initializers[model_name] = initializer()
else: instantiated_initializers[model_name] = initializer
self._initializers = instantiated_initializers
def apply(self, model:nn.Module):
for name, child in model.named_children():
try:
self._initializers[name](child)
except:
if self.verbose: warnings.warn("No initializer found for {}".format(name))
def __init__(self, initializers: Dict[str, Initializer], verbose=True):
self.verbose = verbose
instantiated_initializers = {}
for model_name, initializer in initializers.items():
if isinstance(initializer, type):
instantiated_initializers[model_name] = initializer()
else:
instantiated_initializers[model_name] = initializer
self._initializers = instantiated_initializers
def apply(self, model: nn.Module):
for name, child in model.named_children():
try:
self._initializers[name](child)
except:
if self.verbose:
warnings.warn("No initializer found for {}".format(name))
class Normal(Initializer):
def __init__(self, mean=0.0, std=1.0, bias=False, pattern='.'):
self.mean = mean
self.std = std
self.bias = bias
self.pattern = pattern
super(Normal, self).__init__()
def __call__(self, submodel:nn.Module):
for n,p in submodel.named_parameters():
if re.search(self.pattern, n):
if self.bias and ('bias' in n):
nn.init.normal_(p, mean=self.mean, std=self.std)
elif 'bias' in n:
pass
elif p.requires_grad:
nn.init.normal_(p, mean=self.mean, std=self.std)
def __init__(self, mean=0.0, std=1.0, bias=False, pattern="."):
self.mean = mean
self.std = std
self.bias = bias
self.pattern = pattern
super(Normal, self).__init__()
def __call__(self, submodel: nn.Module):
for n, p in submodel.named_parameters():
if re.search(self.pattern, n):
if self.bias and ("bias" in n):
nn.init.normal_(p, mean=self.mean, std=self.std)
elif "bias" in n:
pass
elif p.requires_grad:
nn.init.normal_(p, mean=self.mean, std=self.std)
class Uniform(Initializer):
def __init__(self, a=0, b=1, bias=False, pattern='.'):
self.a = a
self.b = b
self.bias = bias
self.pattern = pattern
super(Uniform, self).__init__()
def __call__(self, submodel:nn.Module):
for n,p in submodel.named_parameters():
if re.search(self.pattern, n):
if self.bias and ('bias' in n):
nn.init.uniform_(p, a=self.a, b=self.b)
elif 'bias' in n:
pass
elif p.requires_grad:
nn.init.uniform_(p, a=self.a, b=self.b)
def __init__(self, a=0, b=1, bias=False, pattern="."):
self.a = a
self.b = b
self.bias = bias
self.pattern = pattern
super(Uniform, self).__init__()
def __call__(self, submodel: nn.Module):
for n, p in submodel.named_parameters():
if re.search(self.pattern, n):
if self.bias and ("bias" in n):
nn.init.uniform_(p, a=self.a, b=self.b)
elif "bias" in n:
pass
elif p.requires_grad:
nn.init.uniform_(p, a=self.a, b=self.b)
class ConstantInitializer(Initializer):
def __init__(self, value, bias=False, pattern="."):
def __init__(self, value, bias=False, pattern='.'):
self.bias = bias
self.value = value
self.pattern = pattern
super(ConstantInitializer, self).__init__()
self.bias = bias
self.value = value
self.pattern = pattern
super(ConstantInitializer, self).__init__()
def __call__(self, submodel:nn.Module):
for n,p in submodel.named_parameters():
if re.search(self.pattern, n):
if self.bias and ('bias' in n):
nn.init.constant_(p, val=self.value)
elif ('bias' in n):
pass
elif p.requires_grad:
nn.init.constant_(p, val=self.value)
def __call__(self, submodel: nn.Module):
for n, p in submodel.named_parameters():
if re.search(self.pattern, n):
if self.bias and ("bias" in n):
nn.init.constant_(p, val=self.value)
elif "bias" in n:
pass
elif p.requires_grad:
nn.init.constant_(p, val=self.value)
class XavierUniform(Initializer):
def __init__(self, gain=1, pattern='.'):
self.gain = gain
self.pattern = pattern
super(XavierUniform, self).__init__()
def __call__(self, submodel:nn.Module):
for n,p in submodel.named_parameters():
if re.search(self.pattern, n):
if 'bias' in n:
nn.init.constant_(p, val=0)
elif p.requires_grad:
try:
nn.init.xavier_uniform_(p, gain=self.gain)
except: pass
def __init__(self, gain=1, pattern="."):
self.gain = gain
self.pattern = pattern
super(XavierUniform, self).__init__()
def __call__(self, submodel: nn.Module):
for n, p in submodel.named_parameters():
if re.search(self.pattern, n):
if "bias" in n:
nn.init.constant_(p, val=0)
elif p.requires_grad:
try:
nn.init.xavier_uniform_(p, gain=self.gain)
except:
pass
class XavierNormal(Initializer):
def __init__(self, gain=1, pattern='.'):
self.gain = gain
self.pattern = pattern
super(XavierNormal, self).__init__()
def __call__(self, submodel:nn.Module):
for n,p in submodel.named_parameters():
if re.search(self.pattern, n):
if 'bias' in n:
nn.init.constant_(p, val=0)
elif p.requires_grad:
try:
nn.init.xavier_normal_(p, gain=self.gain)
except: pass
def __init__(self, gain=1, pattern="."):
self.gain = gain
self.pattern = pattern
super(XavierNormal, self).__init__()
def __call__(self, submodel: nn.Module):
for n, p in submodel.named_parameters():
if re.search(self.pattern, n):
if "bias" in n:
nn.init.constant_(p, val=0)
elif p.requires_grad:
try:
nn.init.xavier_normal_(p, gain=self.gain)
except:
pass
class KaimingUniform(Initializer):
def __init__(self, a=0, mode='fan_in', nonlinearity='leaky_relu', pattern='.'):
self.a = a
self.mode = mode
self.nonlinearity = nonlinearity
self.pattern = pattern
super(KaimingUniform, self).__init__()
def __call__(self, submodel:nn.Module):
for n,p in submodel.named_parameters():
if re.search(self.pattern, n):
if 'bias' in n:
nn.init.constant_(p, val=0)
elif p.requires_grad:
try:
nn.init.kaiming_normal_(p, a=self.a, mode=self.mode, nonlinearity=self.nonlinearity)
except: pass
def __init__(self, a=0, mode="fan_in", nonlinearity="leaky_relu", pattern="."):
self.a = a
self.mode = mode
self.nonlinearity = nonlinearity
self.pattern = pattern
super(KaimingUniform, self).__init__()
def __call__(self, submodel: nn.Module):
for n, p in submodel.named_parameters():
if re.search(self.pattern, n):
if "bias" in n:
nn.init.constant_(p, val=0)
elif p.requires_grad:
try:
nn.init.kaiming_normal_(
p, a=self.a, mode=self.mode, nonlinearity=self.nonlinearity
)
except:
pass
class KaimingNormal(Initializer):
def __init__(self, a=0, mode='fan_in', nonlinearity='leaky_relu', pattern='.'):
self.a = a
self.mode = mode
self.nonlinearity = nonlinearity
self.pattern = pattern
super(KaimingNormal, self).__init__()
def __call__(self, submodel:nn.Module):
for n,p in submodel.named_parameters():
if re.search(self.pattern, n):
if 'bias' in n:
nn.init.constant_(p, val=0)
elif p.requires_grad:
try:
nn.init.kaiming_normal_(p, a=self.a, mode=self.mode, nonlinearity=self.nonlinearity)
except: pass
def __init__(self, a=0, mode="fan_in", nonlinearity="leaky_relu", pattern="."):
self.a = a
self.mode = mode
self.nonlinearity = nonlinearity
self.pattern = pattern
super(KaimingNormal, self).__init__()
def __call__(self, submodel: nn.Module):
for n, p in submodel.named_parameters():
if re.search(self.pattern, n):
if "bias" in n:
nn.init.constant_(p, val=0)
elif p.requires_grad:
try:
nn.init.kaiming_normal_(
p, a=self.a, mode=self.mode, nonlinearity=self.nonlinearity
)
except:
pass
class Orthogonal(Initializer):
def __init__(self, gain=1, pattern='.'):
self.gain = gain
self.pattern = pattern
super(Orthogonal, self).__init__()
def __call__(self, submodel:nn.Module):
for n,p in submodel.named_parameters():
if re.search(self.pattern, n):
if 'bias' in n:
nn.init.constant_(p, val=0)
elif p.requires_grad:
try:
nn.init.orthogonal_(p, gain=self.gain)
except: pass
def __init__(self, gain=1, pattern="."):
self.gain = gain
self.pattern = pattern
super(Orthogonal, self).__init__()
def __call__(self, submodel: nn.Module):
for n, p in submodel.named_parameters():
if re.search(self.pattern, n):
if "bias" in n:
nn.init.constant_(p, val=0)
elif p.requires_grad:
try:
nn.init.orthogonal_(p, gain=self.gain)
except:
pass
......@@ -9,24 +9,28 @@ use_cuda = torch.cuda.is_available()
class FocalLoss(nn.Module):
def __init__(self, alpha:float=0.25, gamma:float=1.):
def __init__(self, alpha: float = 0.25, gamma: float = 1.0):
super().__init__()
self.alpha = alpha
self.gamma = gamma
def get_weight(self, x:Tensor, t:Tensor) -> Tensor:
def get_weight(self, x: Tensor, t: Tensor) -> Tensor:
p = x.sigmoid()
pt = p*t + (1-p)*(1-t)
w = self.alpha*t + (1-self.alpha)*(1-t)
return (w * (1-pt).pow(self.gamma)).detach()
pt = p * t + (1 - p) * (1 - t) # type: ignore
w = self.alpha * t + (1 - self.alpha) * (1 - t) # type: ignore
return (w * (1 - pt).pow(self.gamma)).detach() # type: ignore
def forward(self, input:Tensor, target:Tensor) -> Tensor:
def forward(self, input: Tensor, target: Tensor) -> Tensor: # type: ignore
if input.size(1) == 1:
input = torch.cat([1-input, input], axis=1)
input = torch.cat([1 - input, input], axis=1) # type: ignore
num_class = 2
else: num_class = input.size(1)
else:
num_class = input.size(1)
binary_target = torch.eye(num_class)[target.long()]
if use_cuda: binary_target = binary_target.cuda()
if use_cuda:
binary_target = binary_target.cuda()
binary_target = binary_target.contiguous()
weight = self.get_weight(input, binary_target)
return F.binary_cross_entropy_with_logits(input, binary_target, weight, reduction='mean')
\ No newline at end of file
return F.binary_cross_entropy_with_logits(
input, binary_target, weight, reduction="mean"
)
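# For reference, this realises the focal loss of Lin et al. (2017),
# FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t), by passing
# w = alpha_t * (1 - p_t)^gamma as the weight of a binary cross-entropy with
# logits; with gamma=0 and alpha=0.5 it reduces to plain BCE scaled by 0.5.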
import numpy as np
import torch
from .callbacks import Callback
from .wdtypes import *
class Metric(object):
def __init__(self):
self._name = ""
def reset(self):
raise NotImplementedError('Custom Metrics must implement this function')
raise NotImplementedError("Custom Metrics must implement this function")
def __call__(self, y_pred:Tensor, y_true:Tensor):
raise NotImplementedError('Custom Metrics must implement this function')
def __call__(self, y_pred: Tensor, y_true: Tensor):
raise NotImplementedError("Custom Metrics must implement this function")
class MultipleMetrics(object):
def __init__(self, metrics:List[Metric], prefix:str=''):
def __init__(self, metrics: List[Metric], prefix: str = ""):
instantiated_metrics = []
for metric in metrics:
if isinstance(metric, type): instantiated_metrics.append(metric())
else: instantiated_metrics.append(metric)
if isinstance(metric, type):
instantiated_metrics.append(metric())
else:
instantiated_metrics.append(metric)
self._metrics = instantiated_metrics
self.prefix = prefix
......@@ -29,58 +31,56 @@ class MultipleMetrics(object):
for metric in self._metrics:
metric.reset()
def __call__(self, y_pred:Tensor, y_true:Tensor) -> Dict:
def __call__(self, y_pred: Tensor, y_true: Tensor) -> Dict:
logs = {}
for metric in self._metrics:
logs[self.prefix+metric._name] = metric(y_pred, y_true)
logs[self.prefix + metric._name] = metric(y_pred, y_true)
return logs
class MetricCallback(Callback):
def __init__(self, container:MultipleMetrics):
def __init__(self, container: MultipleMetrics):
self.container = container
def on_epoch_begin(self, epoch:int, logs:Optional[Dict]=None):
def on_epoch_begin(self, epoch: int, logs: Optional[Dict] = None):
self.container.reset()
class CategoricalAccuracy(Metric):
def __init__(self, top_k=1):
self.top_k = top_k
self.correct_count = 0
self.total_count = 0
self._name = 'acc'
self._name = "acc"
def reset(self):
self.correct_count = 0
self.total_count = 0
def __call__(self, y_pred:Tensor, y_true:Tensor) -> np.ndarray:
top_k = (y_pred.topk(self.top_k,1)[1])
true_k = y_true.view(len(y_true),1).expand_as(top_k)
def __call__(self, y_pred: Tensor, y_true: Tensor) -> np.ndarray:
top_k = y_pred.topk(self.top_k, 1)[1]
true_k = y_true.view(len(y_true), 1).expand_as(top_k) # type: ignore
self.correct_count += top_k.eq(true_k).float().sum().item()
self.total_count += len(y_pred)
self.total_count += len(y_pred) # type: ignore
accuracy = float(self.correct_count) / float(self.total_count)
return np.round(accuracy, 4)
class BinaryAccuracy(Metric):
def __init__(self):
self.correct_count = 0
self.total_count = 0
self._name = 'acc'
self._name = "acc"
def reset(self):
self.correct_count = 0
self.total_count = 0
def __call__(self, y_pred:Tensor, y_true:Tensor) -> np.ndarray:
def __call__(self, y_pred: Tensor, y_true: Tensor) -> np.ndarray:
y_pred_round = y_pred.round()
self.correct_count += y_pred_round.eq(y_true.view(-1,1)).float().sum().item()
self.total_count += len(y_pred)
self.correct_count += y_pred_round.eq(y_true.view(-1, 1)).float().sum().item()
self.total_count += len(y_pred) # type: ignore
accuracy = float(self.correct_count) / float(self.total_count)
return np.round(accuracy, 4)
\ No newline at end of file
return np.round(accuracy, 4)
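# Hedged sketch of how the accuracy metrics accumulate across batches:
# acc = BinaryAccuracy()
# acc(torch.tensor([[0.9], [0.2]]), torch.tensor([1.0, 0.0]))  # -> 1.0
# acc(torch.tensor([[0.9]]), torch.tensor([0.0]))  # running accuracy -> 0.6667
# MetricCallback calls reset() at each epoch start so counts do not leak over.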
......@@ -3,4 +3,4 @@ from .deep_dense import DeepDense
from .deep_text import DeepText
from .deep_image import DeepImage
from .wide_deep import WideDeep
\ No newline at end of file
from .wide_deep import WideDeep
import torch
from ..wdtypes import *
class MultipleLRScheduler(object):
def __init__(self, scheds:Dict[str,LRScheduler]):
def __init__(self, scheds: Dict[str, LRScheduler]):
self._schedulers = scheds
def step(self):
......
import torch
from ..wdtypes import *
class MultipleOptimizer(object):
def __init__(self, opts:Dict[str,Optimizer]):
def __init__(self, opts: Dict[str, Optimizer]):
self._optimizers = opts
def zero_grad(self):
......@@ -13,4 +11,4 @@ class MultipleOptimizer(object):
def step(self):
for _, op in self._optimizers.items():
op.step()
\ No newline at end of file
op.step()
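# Hedged usage sketch mirroring the training scripts above:
# opts = MultipleOptimizer({"wide": wide_opt, "deepdense": deep_opt})
# opts.zero_grad(); loss.backward(); opts.step()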
......@@ -2,15 +2,17 @@ from torchvision.transforms import Compose
from ..wdtypes import *
class MultipleTransforms(object):
def __init__(self, transforms:List):
class MultipleTransforms(object):
def __init__(self, transforms: List[Transforms]):
instantiated_transforms = []
for transform in transforms:
if isinstance(transform, type): instantiated_transforms.append(transform())
else: instantiated_transforms.append(transform)
self._transforms = instantiated_transforms
instantiated_transforms = []
for transform in transforms:
if isinstance(transform, type):
instantiated_transforms.append(transform())
else:
instantiated_transforms.append(transform)
self._transforms = instantiated_transforms
def __call__(self):
return Compose(self._transforms)
\ No newline at end of file
def __call__(self):
return Compose(self._transforms)
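# Hedged usage sketch matching the scripts above: bare classes are instantiated,
# instances are passed through, and calling the object returns a torchvision
# Compose, e.g. composed = MultipleTransforms([ToTensor, Normalize(mean, std)])()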
This diff has been collapsed.
......@@ -28,26 +28,37 @@ class WideDeepDataset(Dataset):
transforms: MultipleTransforms() object (which is in itself a torchvision
Compose). See in models/_multiple_transforms.py
"""
def __init__(self, X_wide:Union[np.ndarray, sparse_matrix], X_deep:np.ndarray,
target:Optional[np.ndarray]=None, X_text:Optional[np.ndarray]=None,
X_img:Optional[np.ndarray]=None, transforms:Optional=None):
def __init__(
self,
X_wide: Union[np.ndarray, sparse_matrix],
X_deep: np.ndarray,
target: Optional[np.ndarray] = None,
X_text: Optional[np.ndarray] = None,
X_img: Optional[np.ndarray] = None,
transforms: Optional[Any] = None,
):
self.X_wide = X_wide
self.X_deep = X_deep
self.X_text = X_text
self.X_img = X_img
self.X_img = X_img
self.transforms = transforms
if self.transforms:
self.transforms_names = [tr.__class__.__name__ for tr in self.transforms.transforms]
else: self.transforms_names = []
self.transforms_names = [
tr.__class__.__name__ for tr in self.transforms.transforms
]
else:
self.transforms_names = []
self.Y = target
def __getitem__(self, idx:int):
def __getitem__(self, idx: int):
# X_wide and X_deep are assumed to be *always* present
if isinstance(self.X_wide, sparse_matrix):
X = Bunch(wide=np.array(self.X_wide[idx].todense()).squeeze())
else: X = Bunch(wide=self.X_wide[idx])
X.deepdense= self.X_deep[idx]
else:
X = Bunch(wide=self.X_wide[idx])
X.deepdense = self.X_deep[idx]
if self.X_text is not None:
X.deeptext = self.X_text[idx]
if self.X_img is not None:
......@@ -55,24 +66,29 @@ class WideDeepDataset(Dataset):
# be ingested by the conv layers
xdi = self.X_img[idx]
# if int must be uint8
if 'int' in str(xdi.dtype) and 'uint8' != str(xdi.dtype): xdi = xdi.astype('uint8')
if "int" in str(xdi.dtype) and "uint8" != str(xdi.dtype):
xdi = xdi.astype("uint8")
# if float, must be float32
if 'float' in str(xdi.dtype) and 'float32' != str(xdi.dtype): xdi = xdi.astype('float32')
if "float" in str(xdi.dtype) and "float32" != str(xdi.dtype):
xdi = xdi.astype("float32")
# if there are no transforms, or these do not include ToTensor(),
# then we need to replicate what Tensor() does -> transpose axis
# and normalize if necessary
if not self.transforms or 'ToTensor' not in self.transforms_names:
xdi = xdi.transpose(2,0,1)
if 'int' in str(xdi.dtype): xdi = (xdi/xdi.max()).astype('float32')
if not self.transforms or "ToTensor" not in self.transforms_names:
xdi = xdi.transpose(2, 0, 1)
if "int" in str(xdi.dtype):
xdi = (xdi / xdi.max()).astype("float32")
# if ToTensor() is included, simply apply transforms
if 'ToTensor' in self.transforms_names: xdi = self.transforms(xdi)
# else apply transforms on the result of calling torch.Tensor on
if "ToTensor" in self.transforms_names:
xdi = self.transforms(xdi)
# else apply transforms on the result of calling torch.tensor on
# xdi after all the previous manipulation
elif self.transforms: xdi = self.transforms(torch.Tensor(xdi))
elif self.transforms:
xdi = self.transforms(torch.tensor(xdi))
# fill the Bunch
X.deepimage = xdi
if self.Y is not None:
y = self.Y[idx]
y = self.Y[idx]
return X, y
else:
return X
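A minimal sketch of the dataset above, assuming `WideDeepDataset` is in scope and with toy arrays standing in for the preprocessors' output; each item comes back as a `Bunch` with `.wide` and `.deepdense` attributes:

```python
import numpy as np
from torch.utils.data import DataLoader

# toy inputs; in practice these come from the wide/deep preprocessors
X_wide = np.random.binomial(1, 0.5, (32, 10)).astype("float32")
X_deep = np.random.rand(32, 5).astype("float32")
y = np.random.binomial(1, 0.5, 32)

dataset = WideDeepDataset(X_wide=X_wide, X_deep=X_deep, target=y)
X, target = dataset[0]  # X.wide and X.deepdense hold the two input views
loader = DataLoader(dataset, batch_size=8, shuffle=True)
```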
......
......@@ -5,9 +5,10 @@ from torch import nn
from ..wdtypes import *
def dense_layer(inp:int, out:int, p:float=0., bn=False):
def dense_layer(inp: int, out: int, p: float = 0.0, bn=False):
layers = [nn.Linear(inp, out), nn.LeakyReLU(inplace=True)]
if bn: layers.append(nn.BatchNorm1d(out))
if bn:
layers.append(nn.BatchNorm1d(out))
layers.append(nn.Dropout(p))
return nn.Sequential(*layers)
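A quick sketch of what `dense_layer` builds (Linear -> LeakyReLU -> optional BatchNorm -> Dropout), assuming the function is in scope:

```python
import torch

block = dense_layer(16, 8, p=0.5, bn=True)
out = block(torch.rand(4, 16))  # shape: (4, 8)
```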
......@@ -70,14 +71,17 @@ class DeepDense(nn.Module):
[ 6.7187e-02, -1.2821e-03, -3.0960e-04, 3.6123e-01]],
grad_fn=<LeakyReluBackward1>)
"""
def __init__(self,
deep_column_idx:Dict[str,int],
hidden_layers:List[int],
batchnorm:bool=False,
dropout:Optional[List[float]]=None,
embed_input:Optional[List[Tuple[str,int,int]]]=None,
embed_p:float=0.,
continuous_cols:Optional[List[str]]=None):
def __init__(
self,
deep_column_idx: Dict[str, int],
hidden_layers: List[int],
batchnorm: bool = False,
dropout: Optional[List[float]] = None,
embed_input: Optional[List[Tuple[str, int, int]]] = None,
embed_p: float = 0.0,
continuous_cols: Optional[List[str]] = None,
):
super(DeepDense, self).__init__()
self.embed_input = embed_input
......@@ -86,38 +90,52 @@ class DeepDense(nn.Module):
# Embeddings
if self.embed_input is not None:
self.embed_layers = nn.ModuleDict({'emb_layer_'+col: nn.Embedding(val, dim)
for col, val, dim in self.embed_input})
self.embed_layers = nn.ModuleDict(
{
"emb_layer_" + col: nn.Embedding(val, dim)
for col, val, dim in self.embed_input
}
)
self.embed_dropout = nn.Dropout(embed_p)
emb_inp_dim = np.sum([embed[2] for embed in self.embed_input])
else:
emb_inp_dim = 0
# Continuous
if self.continuous_cols is not None: cont_inp_dim = len(self.continuous_cols)
else: cont_inp_dim = 0
if self.continuous_cols is not None:
cont_inp_dim = len(self.continuous_cols)
else:
cont_inp_dim = 0
# Dense Layers
input_dim = emb_inp_dim + cont_inp_dim
hidden_layers = [input_dim] + hidden_layers
if not dropout: dropout = [0.]*len(hidden_layers)
if not dropout:
dropout = [0.0] * len(hidden_layers)
self.dense = nn.Sequential()
for i in range(1, len(hidden_layers)):
self.dense.add_module(
'dense_layer_{}'.format(i-1),
dense_layer( hidden_layers[i-1], hidden_layers[i], dropout[i-1], batchnorm))
"dense_layer_{}".format(i - 1),
dense_layer(
hidden_layers[i - 1], hidden_layers[i], dropout[i - 1], batchnorm
),
)
# the output_dim attribute will be used as input_dim when "merging" the models
self.output_dim = hidden_layers[-1]
def forward(self, X:Tensor)->Tensor:
def forward(self, X: Tensor) -> Tensor: # type: ignore
if self.embed_input is not None:
x = [self.embed_layers['emb_layer_'+col](X[:,self.deep_column_idx[col]].long())
for col,_,_ in self.embed_input]
x = torch.cat(x, 1)
x = self.embed_dropout(x)
x = [
self.embed_layers["emb_layer_" + col](
X[:, self.deep_column_idx[col]].long()
)
for col, _, _ in self.embed_input
]
x = torch.cat(x, 1) # type: ignore
x = self.embed_dropout(x) # type: ignore
if self.continuous_cols is not None:
cont_idx = [self.deep_column_idx[col] for col in self.continuous_cols]
x_cont = X[:, cont_idx].float()
x = torch.cat([x, x_cont], 1) if self.embed_input is not None else x_cont
return self.dense(x)
\ No newline at end of file
x = torch.cat([x, x_cont], 1) if self.embed_input is not None else x_cont # type: ignore
return self.dense(x) # type: ignore
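A minimal sketch of `DeepDense` with hypothetical columns: two categoricals ("a", "b") that get embedded and one continuous column ("c"); the embeddings and the continuous values are concatenated before the dense stack:

```python
import torch

model = DeepDense(
    deep_column_idx={"a": 0, "b": 1, "c": 2},
    hidden_layers=[8, 4],
    embed_input=[("a", 4, 2), ("b", 4, 2)],  # (col, n_unique, embed_dim)
    continuous_cols=["c"],
)
# categorical codes in the first two columns, a float in the third
X = torch.cat([torch.randint(0, 4, (5, 2)).float(), torch.rand(5, 1)], dim=1)
out = model(X)  # shape: (5, 4), i.e. hidden_layers[-1]
```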
import torch
from ..wdtypes import *
from .deep_dense import dense_layer
......@@ -8,14 +6,23 @@ from torch import nn
from torchvision import models
def conv_layer(ni:int, nf:int, ks:int=3, stride:int=1, maxpool:bool=True,
adaptiveavgpool:bool=False):
def conv_layer(
ni: int,
nf: int,
ks: int = 3,
stride: int = 1,
maxpool: bool = True,
adaptiveavgpool: bool = False,
):
layer = nn.Sequential(
nn.Conv2d(ni, nf, kernel_size=ks, bias=True, stride=stride, padding=ks//2),
nn.Conv2d(ni, nf, kernel_size=ks, bias=True, stride=stride, padding=ks // 2),
nn.BatchNorm2d(nf, momentum=0.01),
nn.LeakyReLU(negative_slope=0.1, inplace=True))
if maxpool: layer.add_module('maxpool', nn.MaxPool2d(2, 2))
if adaptiveavgpool: layer.add_module('adaptiveavgpool', nn.AdaptiveAvgPool2d(output_size=(1, 1)))
nn.LeakyReLU(negative_slope=0.1, inplace=True),
)
if maxpool:
layer.add_module("maxpool", nn.MaxPool2d(2, 2))
if adaptiveavgpool:
layer.add_module("adaptiveavgpool", nn.AdaptiveAvgPool2d(output_size=(1, 1)))
return layer
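`conv_layer` stacks Conv2d -> BatchNorm2d -> LeakyReLU, optionally followed by pooling. A small sketch, assuming the function is in scope:

```python
import torch

layer = conv_layer(3, 64)              # 3x3 conv, stride 1, plus 2x2 max-pool
out = layer(torch.rand(1, 3, 32, 32))  # shape: (1, 64, 16, 16)
```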
......@@ -75,23 +82,25 @@ class DeepImage(nn.Module):
1.5416e-01, 3.9227e-01, 5.5048e-01]], grad_fn=<LeakyReluBackward1>)
"""
def __init__(self,
pretrained:bool=True,
resnet:int=18,
freeze:Union[str,int]=6,
head_layers:Optional[List[int]] = None,
head_dropout:Optional[List[float]]=None,
head_batchnorm:Optional[bool] = False):
def __init__(
self,
pretrained: bool = True,
resnet: int = 18,
freeze: Union[str, int] = 6,
head_layers: Optional[List[int]] = None,
head_dropout: Optional[List[float]] = None,
head_batchnorm: Optional[bool] = False,
):
super(DeepImage, self).__init__()
self.head_layers = head_layers
if pretrained:
if resnet==18:
if resnet == 18:
vision_model = models.resnet18(pretrained=True)
elif resnet==34:
elif resnet == 34:
vision_model = models.resnet34(pretrained=True)
elif resnet==50:
elif resnet == 50:
vision_model = models.resnet50(pretrained=True)
backbone_layers = list(vision_model.children())[:-1]
......@@ -104,7 +113,9 @@ class DeepImage(nn.Module):
frozen_layers.append(layer)
self.backbone = nn.Sequential(*frozen_layers)
if isinstance(freeze, int):
assert freeze < 8, "freeze' must be less than 8 when using resnet architectures"
assert (
freeze < 8
), "freeze' must be less than 8 when using resnet architectures"
frozen_layers = []
trainable_layers = backbone_layers[freeze:]
for layer in backbone_layers[:freeze]:
......@@ -120,30 +131,38 @@ class DeepImage(nn.Module):
conv_layer(64, 128, 1, maxpool=False),
conv_layer(128, 256, 1, maxpool=False),
conv_layer(256, 512, 1, maxpool=False, adaptiveavgpool=True),
)
)
# the output_dim attribute will be used as input_dim when "merging" the models
self.output_dim = 512
if self.head_layers is not None:
assert self.head_layers[0]==self.output_dim, (
assert self.head_layers[0] == self.output_dim, (
"The output dimension from the backbone ({}) is not consistent with "
"the expected input dimension ({}) of the fc-head".format(
self.output_dim, self.head_layers[0]))
if not head_dropout: head_dropout = [0.]*len(head_layers)
self.output_dim, self.head_layers[0]
)
)
if not head_dropout:
head_dropout = [0.0] * len(head_layers)
self.imagehead = nn.Sequential()
for i in range(1, len(head_layers)):
self.imagehead.add_module(
'dense_layer_{}'.format(i-1),
dense_layer(head_layers[i-1], head_layers[i], head_dropout[i-1], head_batchnorm)
)
"dense_layer_{}".format(i - 1),
dense_layer(
head_layers[i - 1],
head_layers[i],
head_dropout[i - 1],
head_batchnorm,
),
)
self.output_dim = head_layers[-1]
def forward(self, x:Tensor)->Tensor:
def forward(self, x: Tensor) -> Tensor: # type: ignore
x = self.backbone(x)
x = x.view(x.size(0), -1)
if self.head_layers is not None:
out = self.imagehead(x)
return out
else:
return x
\ No newline at end of file
return x
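A hedged usage sketch of `DeepImage`: with no fc-head the output is the flattened 512-d backbone feature. Note that `pretrained=True` downloads the torchvision resnet weights on first use:

```python
import torch

model = DeepImage(pretrained=True, resnet=18, freeze=6)
out = model(torch.rand(2, 3, 224, 224))  # shape: (2, 512)
```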
......@@ -67,73 +67,104 @@ class DeepText(nn.Module):
[-0.0844, 0.0681, -0.1016, -0.0464],
[-0.0268, 0.0294, -0.0988, -0.0666]], grad_fn=<SelectBackward>)
"""
def __init__(self,
vocab_size:int,
hidden_dim:int=64,
n_layers:int=3,
rnn_dropout:float=0.,
bidirectional:bool=False,
padding_idx:int=1,
embed_dim:Optional[int]=None,
embedding_matrix:Optional[np.ndarray]=None,
head_layers:Optional[List[int]] = None,
head_dropout:Optional[List[float]]=None,
head_batchnorm:Optional[bool] = False):
def __init__(
self,
vocab_size: int,
hidden_dim: int = 64,
n_layers: int = 3,
rnn_dropout: float = 0.0,
bidirectional: bool = False,
padding_idx: int = 1,
embed_dim: Optional[int] = None,
embedding_matrix: Optional[np.ndarray] = None,
head_layers: Optional[List[int]] = None,
head_dropout: Optional[List[float]] = None,
head_batchnorm: Optional[bool] = False,
):
super(DeepText, self).__init__()
if embed_dim is not None and embedding_matrix is not None and not embed_dim==embedding_matrix.shape[1]:
if (
embed_dim is not None
and embedding_matrix is not None
and not embed_dim == embedding_matrix.shape[1]
):
warnings.warn(
'the input embedding dimension {} and the dimension of the '
'pretrained embeddings {} do not match. The pretrained embeddings '
'dimension ({}) will be used'.format(embed_dim, embedding_matrix.shape[1],
embedding_matrix.shape[1]), UserWarning)
"the input embedding dimension {} and the dimension of the "
"pretrained embeddings {} do not match. The pretrained embeddings "
"dimension ({}) will be used".format(
embed_dim, embedding_matrix.shape[1], embedding_matrix.shape[1]
),
UserWarning,
)
self.bidirectional = bidirectional
self.head_layers = head_layers
# Pre-trained Embeddings
if isinstance(embedding_matrix, np.ndarray):
self.word_embed = nn.Embedding(vocab_size, embedding_matrix.shape[1], padding_idx = padding_idx)
self.word_embed.weight = nn.Parameter(torch.Tensor(embedding_matrix))
assert (
embedding_matrix.dtype == "float32"
), "'embedding_matrix' must be of dtype 'float32', got dtype '{}'".format(
str(embedding_matrix.dtype)
)
self.word_embed = nn.Embedding(
vocab_size, embedding_matrix.shape[1], padding_idx=padding_idx
)
self.word_embed.weight = nn.Parameter(
torch.tensor(embedding_matrix), requires_grad=True
)
embed_dim = embedding_matrix.shape[1]
else:
self.word_embed = nn.Embedding(vocab_size, embed_dim, padding_idx = padding_idx)
self.word_embed = nn.Embedding(
vocab_size, embed_dim, padding_idx=padding_idx
)
# stack of RNNs (LSTMs)
self.rnn = nn.LSTM(embed_dim,
self.rnn = nn.LSTM(
embed_dim,
hidden_dim,
num_layers=n_layers,
bidirectional=bidirectional,
dropout=rnn_dropout,
batch_first=True)
batch_first=True,
)
# the output_dim attribute will be used as input_dim when "merging" the models
self.output_dim = hidden_dim*2 if bidirectional else hidden_dim
self.output_dim = hidden_dim * 2 if bidirectional else hidden_dim
if self.head_layers is not None:
assert self.head_layers[0]==self.output_dim, (
assert self.head_layers[0] == self.output_dim, (
"The hidden dimension from the stack or RNNs ({}) is not consistent with "
"the expected input dimension ({}) of the fc-head".format(
self.output_dim, self.head_layers[0]))
if not head_dropout: head_dropout = [0.]*len(head_layers)
self.output_dim, self.head_layers[0]
)
)
if not head_dropout:
head_dropout = [0.0] * len(head_layers)
self.texthead = nn.Sequential()
for i in range(1, len(head_layers)):
self.texthead.add_module(
'dense_layer_{}'.format(i-1),
dense_layer(head_layers[i-1], head_layers[i], head_dropout[i-1], head_batchnorm)
)
"dense_layer_{}".format(i - 1),
dense_layer(
head_layers[i - 1],
head_layers[i],
head_dropout[i - 1],
head_batchnorm,
),
)
self.output_dim = head_layers[-1]
def forward(self, X:Tensor)->Tensor:
def forward(self, X: Tensor) -> Tensor: # type: ignore
embed = self.word_embed(X.long())
o, (h, c) = self.rnn(embed)
if self.bidirectional:
last_h = torch.cat((h[-2], h[-1]), dim = 1)
last_h = torch.cat((h[-2], h[-1]), dim=1)
else:
last_h = h[-1]
if self.head_layers is not None:
out = self.head(last_h)
out = self.texthead(last_h)
return out
else:
return last_h
\ No newline at end of file
return last_h
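A minimal sketch of `DeepText` with illustrative sizes; with `bidirectional=True` the output is the concatenation of the last hidden states of the two directions:

```python
import torch

model = DeepText(vocab_size=100, embed_dim=8, hidden_dim=16,
                 n_layers=2, bidirectional=True)
X = torch.randint(0, 100, (4, 10))  # batch of padded token-id sequences
out = model(X)                      # shape: (4, 32), i.e. 2 * hidden_dim
```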
import torch
from torch import nn
from ..wdtypes import *
......@@ -10,32 +8,33 @@ class Wide(nn.Module):
Parameters
----------
wide_dim: Int
wide_dim: Int
size of the input tensor
output_dim: Int
output_dim: Int
size of the output tensor
Attributes
----------
wide_linear: nn.Module
wide_linear: nn.Module
the linear layer that comprises the wide branch of the model
Example
--------
>>> import torch
>>> from pytorch_widedeep.models import Wide
>>> X = torch.empty(4, 4).random_(2)
>>> wide = Wide(wide_dim=X.size(0), output_dim=1)
>>> wide(X)
tensor([[-0.8841],
[-0.8633],
[-1.2713],
[-0.4762]], grad_fn=<AddmmBackward>)
>>> import torch
>>> from pytorch_widedeep.models import Wide
>>> X = torch.empty(4, 4).random_(2)
>>> wide = Wide(wide_dim=X.size(0), output_dim=1)
>>> wide(X)
tensor([[-0.8841],
[-0.8633],
[-1.2713],
[-0.4762]], grad_fn=<AddmmBackward>)
"""
def __init__(self,wide_dim:int, output_dim:int=1):
def __init__(self, wide_dim: int, output_dim: int = 1):
super(Wide, self).__init__()
self.wide_linear = nn.Linear(wide_dim, output_dim)
def forward(self, X:Tensor)->Tensor:
def forward(self, X: Tensor) -> Tensor: # type: ignore
out = self.wide_linear(X.float())
return out
import math
import torch
from torch.optim.optimizer import Optimizer, required
from torch.optim.optimizer import Optimizer
class RAdam(Optimizer):
class RAdam(Optimizer):
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
......@@ -29,65 +29,73 @@ class RAdam(Optimizer):
for group in self.param_groups:
for p in group['params']:
for p in group["params"]:
if p.grad is None:
continue
grad = p.grad.data.float()
if grad.is_sparse:
raise RuntimeError('RAdam does not support sparse gradients')
raise RuntimeError("RAdam does not support sparse gradients")
p_data_fp32 = p.data.float()
state = self.state[p]
if len(state) == 0:
state['step'] = 0
state['exp_avg'] = torch.zeros_like(p_data_fp32)
state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
state["step"] = 0
state["exp_avg"] = torch.zeros_like(p_data_fp32)
state["exp_avg_sq"] = torch.zeros_like(p_data_fp32)
else:
state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
state["exp_avg"] = state["exp_avg"].type_as(p_data_fp32)
state["exp_avg_sq"] = state["exp_avg_sq"].type_as(p_data_fp32)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
beta1, beta2 = group["betas"]
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
exp_avg.mul_(beta1).add_(1 - beta1, grad)
state['step'] += 1
buffered = self.buffer[int(state['step'] % 10)]
if state['step'] == buffered[0]:
state["step"] += 1
buffered = self.buffer[int(state["step"] % 10)]
if state["step"] == buffered[0]:
N_sma, step_size = buffered[1], buffered[2]
else:
buffered[0] = state['step']
beta2_t = beta2 ** state['step']
buffered[0] = state["step"]
beta2_t = beta2 ** state["step"]
N_sma_max = 2 / (1 - beta2) - 1
N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
N_sma = N_sma_max - 2 * state["step"] * beta2_t / (1 - beta2_t)
buffered[1] = N_sma
# more conservative since it's an approximated value
if N_sma >= 5:
step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
step_size = math.sqrt(
(1 - beta2_t)
* (N_sma - 4)
/ (N_sma_max - 4)
* (N_sma - 2)
/ N_sma
* N_sma_max
/ (N_sma_max - 2)
) / (1 - beta1 ** state["step"])
else:
step_size = 1.0 / (1 - beta1 ** state['step'])
step_size = 1.0 / (1 - beta1 ** state["step"])
buffered[2] = step_size
if group['weight_decay'] != 0:
p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
if group["weight_decay"] != 0:
p_data_fp32.add_(-group["weight_decay"] * group["lr"], p_data_fp32)
# more conservative since it's an approximated value
if N_sma >= 5:
denom = exp_avg_sq.sqrt().add_(group['eps'])
p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom)
denom = exp_avg_sq.sqrt().add_(group["eps"])
p_data_fp32.addcdiv_(-step_size * group["lr"], exp_avg, denom)
else:
p_data_fp32.add_(-step_size * group['lr'], exp_avg)
p_data_fp32.add_(-step_size * group["lr"], exp_avg)
p.data.copy_(p_data_fp32)
return loss
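Here `N_sma` is the length of the SMA (rho_t in the RAdam paper): when it is at least 5, the variance-rectified adaptive step is applied; otherwise the update falls back to an unadapted, momentum-only step. A usage sketch with a stand-in model:

```python
import torch

model = torch.nn.Linear(10, 1)  # stand-in model
optimizer = RAdam(model.parameters(), lr=1e-3)

loss = model(torch.rand(4, 10)).sum()
loss.backward()
optimizer.step()  # applies the rectified update described above
```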
class PlainRAdam(Optimizer):
class PlainRAdam(Optimizer):
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
......@@ -113,46 +121,58 @@ class PlainRAdam(Optimizer):
for group in self.param_groups:
for p in group['params']:
for p in group["params"]:
if p.grad is None:
continue
grad = p.grad.data.float()
if grad.is_sparse:
raise RuntimeError('RAdam does not support sparse gradients')
raise RuntimeError("RAdam does not support sparse gradients")
p_data_fp32 = p.data.float()
state = self.state[p]
if len(state) == 0:
state['step'] = 0
state['exp_avg'] = torch.zeros_like(p_data_fp32)
state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
state["step"] = 0
state["exp_avg"] = torch.zeros_like(p_data_fp32)
state["exp_avg_sq"] = torch.zeros_like(p_data_fp32)
else:
state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
state["exp_avg"] = state["exp_avg"].type_as(p_data_fp32)
state["exp_avg_sq"] = state["exp_avg_sq"].type_as(p_data_fp32)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
beta1, beta2 = group["betas"]
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
exp_avg.mul_(beta1).add_(1 - beta1, grad)
state['step'] += 1
beta2_t = beta2 ** state['step']
state["step"] += 1
beta2_t = beta2 ** state["step"]
N_sma_max = 2 / (1 - beta2) - 1
N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
N_sma = N_sma_max - 2 * state["step"] * beta2_t / (1 - beta2_t)
if group['weight_decay'] != 0:
p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
if group["weight_decay"] != 0:
p_data_fp32.add_(-group["weight_decay"] * group["lr"], p_data_fp32)
# more conservative since it's an approximated value
if N_sma >= 5:
step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
denom = exp_avg_sq.sqrt().add_(group['eps'])
step_size = (
group["lr"]
* math.sqrt(
(1 - beta2_t)
* (N_sma - 4)
/ (N_sma_max - 4)
* (N_sma - 2)
/ N_sma
* N_sma_max
/ (N_sma_max - 2)
)
/ (1 - beta1 ** state["step"])
)
denom = exp_avg_sq.sqrt().add_(group["eps"])
p_data_fp32.addcdiv_(-step_size, exp_avg, denom)
else:
step_size = group['lr'] / (1 - beta1 ** state['step'])
step_size = group["lr"] / (1 - beta1 ** state["step"])
p_data_fp32.add_(-step_size, exp_avg)
p.data.copy_(p_data_fp32)
......@@ -161,8 +181,9 @@ class PlainRAdam(Optimizer):
class AdamW(Optimizer):
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup = 0):
def __init__(
self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup=0
):
if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
if not 0.0 <= eps:
......@@ -172,8 +193,9 @@ class AdamW(Optimizer):
if not 0.0 <= betas[1] < 1.0:
raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
defaults = dict(lr=lr, betas=betas, eps=eps,
weight_decay=weight_decay, warmup = warmup)
defaults = dict(
lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, warmup=warmup
)
super(AdamW, self).__init__(params, defaults)
def __setstate__(self, state):
......@@ -186,49 +208,53 @@ class AdamW(Optimizer):
for group in self.param_groups:
for p in group['params']:
for p in group["params"]:
if p.grad is None:
continue
grad = p.grad.data.float()
if grad.is_sparse:
raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
raise RuntimeError(
"Adam does not support sparse gradients, please consider SparseAdam instead"
)
p_data_fp32 = p.data.float()
state = self.state[p]
if len(state) == 0:
state['step'] = 0
state['exp_avg'] = torch.zeros_like(p_data_fp32)
state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
state["step"] = 0
state["exp_avg"] = torch.zeros_like(p_data_fp32)
state["exp_avg_sq"] = torch.zeros_like(p_data_fp32)
else:
state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
state["exp_avg"] = state["exp_avg"].type_as(p_data_fp32)
state["exp_avg_sq"] = state["exp_avg_sq"].type_as(p_data_fp32)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
beta1, beta2 = group["betas"]
state['step'] += 1
state["step"] += 1
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
exp_avg.mul_(beta1).add_(1 - beta1, grad)
denom = exp_avg_sq.sqrt().add_(group['eps'])
bias_correction1 = 1 - beta1 ** state['step']
bias_correction2 = 1 - beta2 ** state['step']
denom = exp_avg_sq.sqrt().add_(group["eps"])
bias_correction1 = 1 - beta1 ** state["step"]
bias_correction2 = 1 - beta2 ** state["step"]
if group['warmup'] > state['step']:
scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup']
if group["warmup"] > state["step"]:
scheduled_lr = 1e-8 + state["step"] * group["lr"] / group["warmup"]
else:
scheduled_lr = group['lr']
scheduled_lr = group["lr"]
step_size = scheduled_lr * math.sqrt(bias_correction2) / bias_correction1
step_size = (
scheduled_lr * math.sqrt(bias_correction2) / bias_correction1
)
if group['weight_decay'] != 0:
p_data_fp32.add_(-group['weight_decay'] * scheduled_lr, p_data_fp32)
if group["weight_decay"] != 0:
p_data_fp32.add_(-group["weight_decay"] * scheduled_lr, p_data_fp32)
p_data_fp32.addcdiv_(-step_size, exp_avg, denom)
p.data.copy_(p_data_fp32)
return loss
\ No newline at end of file
return loss
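This AdamW variant adds a linear learning-rate warmup: for the first `warmup` steps, `scheduled_lr` ramps from roughly 1e-8 up to `lr`, after which the base rate is used. A short sketch with a stand-in model:

```python
import torch

model = torch.nn.Linear(10, 1)  # stand-in model
# the lr is ramped linearly over the first 500 steps, then held at 1e-3
optimizer = AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2, warmup=500)
```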
from ._preprocessors import WidePreprocessor
from ._preprocessors import DeepPreprocessor
from ._preprocessors import TextPreprocessor
from ._preprocessors import ImagePreprocessor
\ No newline at end of file
from ._preprocessors import ImagePreprocessor
from .dense_utils import *
from .text_utils import *
from .fastai_transforms import *
from .image_utils import *
\ No newline at end of file
from .image_utils import *
import numpy as np
import pandas as pd
from ..wdtypes import *
......@@ -7,11 +6,14 @@ from ..wdtypes import *
pd.options.mode.chained_assignment = None
__all__ = ['label_encoder']
__all__ = ["label_encoder"]
def label_encoder(df_inp:pd.DataFrame, cols:Optional[List[str]]=None,
val_to_idx:Optional[Dict[str,Dict[str,int]]]=None):
def label_encoder(
df_inp: pd.DataFrame,
cols: Optional[List[str]] = None,
val_to_idx: Optional[Dict[str, Dict[str, int]]] = None,
):
r"""
Label-encode some features of a given dataset.
......@@ -33,12 +35,12 @@ def label_encoder(df_inp:pd.DataFrame, cols:Optional[List[str]]=None,
"""
df = df_inp.copy()
if cols == None:
cols = list(df.select_dtypes(include=['object']).columns)
if cols is None:
cols = list(df.select_dtypes(include=["object"]).columns)
if not val_to_idx:
val_types = dict()
for c in cols:
for c in cols: # type: ignore
val_types[c] = df[c].unique()
val_to_idx = dict()
for k, v in val_types.items():
......@@ -47,4 +49,4 @@ def label_encoder(df_inp:pd.DataFrame, cols:Optional[List[str]]=None,
for k, v in val_to_idx.items():
df[k] = df[k].apply(lambda x: v[x])
return df, val_to_idx
\ No newline at end of file
return df, val_to_idx
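A small sketch of `label_encoder`, assuming it is in scope: with `cols=None` it encodes every `object`-dtype column and returns the encoded frame together with the value-to-index mapping:

```python
import pandas as pd

df = pd.DataFrame({"color": ["red", "blue", "red"], "size": [1, 2, 3]})
df_enc, val_to_idx = label_encoder(df)  # only "color" is object-dtype
# val_to_idx is e.g. {"color": {"red": 0, "blue": 1}} (ordering may vary)
```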
'''
"""
AspectAwarePreprocessor and SimplePreprocessor are directly taken from the
great series of books "Deep Learning for Computer Vision" by Adrian
(https://www.pyimagesearch.com/author/adrian/). Check here
https://www.pyimagesearch.com/
Credit for the code here to ADRIAN ROSEBROCK
'''
"""
import numpy as np
import imutils
import cv2
from ..wdtypes import *
__all__ = ['AspectAwarePreprocessor', 'SimplePreprocessor']
__all__ = ["AspectAwarePreprocessor", "SimplePreprocessor"]
class AspectAwarePreprocessor:
def __init__(self, width:int, height:int, inter=cv2.INTER_AREA):
def __init__(self, width: int, height: int, inter=cv2.INTER_AREA):
self.width = width
self.height = height
self.inter = inter
def preprocess(self, image:np.ndarray)->np.ndarray:
def preprocess(self, image: np.ndarray) -> np.ndarray:
(h, w) = image.shape[:2]
dW = 0
dH = 0
if w < h:
image = imutils.resize(image, width=self.width,
inter=self.inter)
image = imutils.resize(image, width=self.width, inter=self.inter)
dH = int((image.shape[0] - self.height) / 2.0)
else:
image = imutils.resize(image, height=self.height,
inter=self.inter)
image = imutils.resize(image, height=self.height, inter=self.inter)
dW = int((image.shape[1] - self.width) / 2.0)
(h, w) = image.shape[:2]
image = image[dH:h - dH, dW:w - dW]
image = image[dH : h - dH, dW : w - dW]
return cv2.resize(image, (self.width, self.height),
interpolation=self.inter)
return cv2.resize(image, (self.width, self.height), interpolation=self.inter)
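A hedged sketch of `AspectAwarePreprocessor`: it resizes so the shorter side matches its target, centre-crops the excess along the other axis, then does a final exact resize:

```python
import numpy as np

aap = AspectAwarePreprocessor(width=224, height=224)
img = np.random.randint(0, 255, (400, 600, 3), dtype=np.uint8)
out = aap.preprocess(img)  # shape: (224, 224, 3)
```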
class SimplePreprocessor:
def __init__(self, width:int, height:int, inter=cv2.INTER_AREA):
def __init__(self, width: int, height: int, inter=cv2.INTER_AREA):
self.width = width
self.height = height
self.inter = inter
def preprocess(self, image:np.ndarray)->np.ndarray:
return cv2.resize(image, (self.width, self.height),
interpolation=self.inter)
def preprocess(self, image: np.ndarray) -> np.ndarray:
return cv2.resize(image, (self.width, self.height), interpolation=self.inter)
__version__ = '0.3.8'
\ No newline at end of file
__version__ = "0.3.8"
......@@ -2,23 +2,83 @@ import sys
from torch.nn import Module
from torch import Tensor
from torchvision.transforms import *
from torchvision.transforms import (
CenterCrop,
ColorJitter,
Compose,
FiveCrop,
Grayscale,
Lambda,
LinearTransformation,
Normalize,
Pad,
RandomAffine,
RandomApply,
RandomChoice,
RandomCrop,
RandomGrayscale,
RandomHorizontalFlip,
RandomOrder,
RandomResizedCrop,
RandomRotation,
RandomSizedCrop,
RandomVerticalFlip,
Resize,
Scale,
TenCrop,
ToPILImage,
ToTensor,
)
from torch.optim.optimizer import Optimizer
from torch.utils.data.dataloader import DataLoader
from torch.optim.lr_scheduler import _LRScheduler
from pathlib import PosixPath
from typing import (List, Any, Union, Dict, Callable, Optional, Tuple,
Generator, Collection, Iterable)
from typing import (
List,
Any,
Union,
Dict,
Callable,
Optional,
Tuple,
Generator,
Collection,
Iterable,
Match,
Iterator,
)
from scipy.sparse.csr import csr_matrix as sparse_matrix
from types import SimpleNamespace
SimpleNamespace = type(sys.implementation)
ListRules = Collection[Callable[[str],str]]
ListRules = Collection[Callable[[str], str]]
Tokens = Collection[Collection[str]]
Transforms= Union[CenterCrop, ColorJitter, Compose, FiveCrop, Grayscale,
Lambda, LinearTransformation, Normalize, Pad, RandomAffine,
RandomApply, RandomChoice, RandomCrop, RandomGrayscale,
RandomHorizontalFlip, RandomOrder, RandomResizedCrop, RandomRotation,
RandomSizedCrop, RandomVerticalFlip, Resize, Scale, TenCrop,
ToPILImage, ToTensor]
Transforms = Union[
CenterCrop,
ColorJitter,
Compose,
FiveCrop,
Grayscale,
Lambda,
LinearTransformation,
Normalize,
Pad,
RandomAffine,
RandomApply,
RandomChoice,
RandomCrop,
RandomGrayscale,
RandomHorizontalFlip,
RandomOrder,
RandomResizedCrop,
RandomRotation,
RandomSizedCrop,
RandomVerticalFlip,
Resize,
Scale,
TenCrop,
ToPILImage,
ToTensor,
]
LRScheduler = _LRScheduler
ModelParams = Generator[Tensor,Tensor,Tensor]
ModelParams = Generator[Tensor, Tensor, Tensor]
This diff is collapsed.
import numpy as np
import pandas as pd
import os
import pytest
from pytorch_widedeep.preprocessing import ImagePreprocessor
full_path = os.path.realpath(__file__)
path = os.path.split(full_path)[0]
df = pd.DataFrame({'galaxies': ['galaxy1.png', 'galaxy2.png']})
img_col = 'galaxies'
imd_dir = os.path.join(path,'images')
processor = ImagePreprocessor()
X_imgs = processor.fit_transform(df, img_col, img_path=imd_dir)
df = pd.DataFrame({"galaxies": ["galaxy1.png", "galaxy2.png"]})
img_col = "galaxies"
imd_dir = os.path.join(path, "images")
processor = ImagePreprocessor(img_col=img_col, img_path=imd_dir)
X_imgs = processor.fit_transform(df)
###############################################################################
# There is not much to test here, since I only resize.
###############################################################################
def test_sizes():
img_width = X_imgs.shape[1]
img_height = X_imgs.shape[2]
assert np.all((img_width==processor.width, img_height==processor.height))
\ No newline at end of file
img_width = X_imgs.shape[1]
img_height = X_imgs.shape[2]
assert np.all((img_width == processor.width, img_height == processor.height))
import torch
import pytest
from pytorch_widedeep.models import Wide
inp = torch.rand(10, 10)
model = Wide(10, 1)
###############################################################################
# Simply testing that it runs
###############################################################################
def test_wide():
out = model(inp)
assert out.size(0) == 10 and out.size(1) == 1
\ No newline at end of file
out = model(inp)
assert out.size(0) == 10 and out.size(1) == 1