From 29be74fdc81588b464c54d530e0ccdf9d48a4e12 Mon Sep 17 00:00:00 2001
From: PyCaret
Date: Mon, 20 Jul 2020 13:58:45 -0400
Subject: [PATCH] updated pycaret-nightly==0.28 part 1/3

---
 Dockerfile            |  11 ++++
 pycaret/anomaly.py    |  41 +++++++++++--
 pycaret/clustering.py | 138 +++++++++++++++++++++++++++---------------
 pycaret/nlp.py        |  23 +++++--
 4 files changed, 155 insertions(+), 58 deletions(-)
 create mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..9df8fa0
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,11 @@
+FROM python:3.7-slim
+
+WORKDIR /app
+
+ADD . /app
+
+RUN apt-get update && apt-get install -y libgomp1
+
+RUN pip install --trusted-host pypi.python.org -r requirements.txt
+
+CMD pytest
\ No newline at end of file
diff --git a/pycaret/anomaly.py b/pycaret/anomaly.py
index fe5d85b..eab716d 100644
--- a/pycaret/anomaly.py
+++ b/pycaret/anomaly.py
@@ -2,7 +2,7 @@
 # Author: Moez Ali
 # License: MIT
 # Release: PyCaret 2.0x
-# Last modified : 14/07/2020
+# Last modified : 20/07/2020
 
 def setup(data,
           categorical_features = None,
@@ -1016,6 +1016,7 @@ def setup(data,
     if verbose:
         if html_param:
             clear_output()
+            print('Setup Successfully Completed!')
             display(functions_)
         else:
             print(functions_.data)
@@ -1239,7 +1240,7 @@ def create_model(model = None,
         sys.exit('(Value Error): Model Not Available. Please see docstring for list of available models.')
 
     #checking fraction type:
-    if type(fraction) is not float:
+    if fraction <= 0 or fraction >= 1:
         sys.exit('(Type Error): Fraction parameter can only take value as float between 0 to 1.')
 
     #checking verbose parameter
@@ -1362,6 +1363,13 @@ def create_model(model = None,
         model = SOS(contamination=fraction, **kwargs)
         full_name = 'Stochastic Outlier Selection'
 
+    else:
+        def get_model_name(e):
+            return str(e).split("(")[0]
+
+        model = model
+        full_name = get_model_name(model)
+
     logger.info(str(full_name) + ' Imported succesfully')
 
     #monitor update
@@ -1665,6 +1673,7 @@ def tune_model(model=None,
                method='drop',
                estimator=None,
                optimize=None,
+               custom_grid = None, #added in pycaret 2.0.0
                fold=10,
                verbose=True): #added in pycaret 2.0.0
 
@@ -1773,6 +1782,11 @@ def tune_model(model=None,
 
     optimize: string, default = None
 
+    custom_grid: list, default = None
+    By default, a pre-defined list of fraction values is iterated over to
+    optimize the supervised objective. To overwrite default iteration,
+    pass a list of fraction values to iterate over in custom_grid param.
+
     For Classification tasks:
     Accuracy, AUC, Recall, Precision, F1, Kappa
@@ -1884,7 +1898,11 @@ def tune_model(model=None,
     import datetime, time
 
     #progress bar
-    max_steps = 25
+    if custom_grid is None:
+        max_steps = 25
+    else:
+        max_steps = 15 + len(custom_grid)
+
     progress = ipw.IntProgress(value=0, min=0, max=max_steps, step=1 , description='Processing: ')
 
     if verbose:
@@ -1994,8 +2012,19 @@ def tune_model(model=None,
     progress.value += 1
 
     #defining tuning grid
-    param_grid_with_zero = [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10]
-    param_grid = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10]
+
+    if custom_grid is not None:
+
+        param_grid = custom_grid
+        param_grid_with_zero = [0]
+
+        for i in param_grid:
+            param_grid_with_zero.append(i)
+
+    else:
+
+        param_grid_with_zero = [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10]
+        param_grid = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10]
 
     master = []; master_df = []
 
@@ -2199,7 +2228,7 @@ def tune_model(model=None,
                                  supervised = True,
                                  supervised_target = supervised_target,
                                  session_id = seed,
-                                 logging = False, #added in pycaret==2.0.0
+                                 log_experiment = False, #added in pycaret==2.0.0
                                  profile=False,
                                  verbose=False)
 
diff --git a/pycaret/clustering.py b/pycaret/clustering.py
index e0c9d1b..4c403cd 100644
--- a/pycaret/clustering.py
+++ b/pycaret/clustering.py
@@ -2,7 +2,7 @@
 # Author: Moez Ali
 # License: MIT
 # Release: PyCaret 2.0x
-# Last modified : 14/07/2020
+# Last modified : 20/07/2020
 
 def setup(data,
           categorical_features = None,
@@ -27,7 +27,7 @@ def setup(data,
           rare_level_threshold = 0.10,
           bin_numeric_features = None,
           remove_multicollinearity = False,
-          multicollinearity_threshold = 0.9,
+          multicollinearity_threshold = 0.9, 
           group_features = None,
           group_names = None,
           supervised = False,
@@ -40,8 +40,8 @@ def setup(data,
           log_plots = False, #added in pycaret==2.0.0
           log_profile = False, #added in pycaret==2.0.0
           log_data = False, #added in pycaret==2.0.0
-          silent=False, #added in pycaret==2.0.0
-          verbose=True,
+          silent = False, #added in pycaret==2.0.0
+          verbose = True,
           profile = False,):
 
     """
@@ -252,13 +252,13 @@ def setup(data,
     unique number is then distributed as a seed in all functions used during the
     experiment. This can be used for later reproducibility of the entire experiment.
 
-    experiment_name: str, default = None
-    Name of experiment for logging. When set to None, 'clf' is by default used as
-    alias for the experiment name.
-
     log_experiment: bool, default = True
     When set to True, all metrics and parameters are logged on MLFlow server.
 
+    experiment_name: str, default = None
+    Name of experiment for logging. When set to None, 'clu' is by default used as
+    alias for the experiment name.
+
     log_plots: bool, default = False
     When set to True, specific plots are logged in MLflow as a png file.
     By default, it is set to False.
@@ -267,6 +267,9 @@ def setup(data,
     When set to True, data profile is also logged on MLflow as a html file.
     By default, it is set to False.
 
+    log_data: bool, default = False
+    When set to True, train and test datasets are logged as csv.
+
     silent: bool, default = False
     When set to True, confirmation of data types is not required. All preprocessing will
     be performed assuming automatically inferred data types. Not recommended for direct use
@@ -826,7 +829,7 @@ def setup(data,
                                           apply_binning = apply_binning_pass,
                                           features_to_binn = features_to_bin_pass,
                                           remove_multicollinearity = remove_multicollinearity,
-                                          maximum_correlation_between_features = multicollinearity_threshold,
+                                          maximum_correlation_between_features = multicollinearity_threshold, 
                                           apply_grouping = apply_grouping_pass,
                                           features_to_group_ListofList = group_features_pass,
                                           group_name = group_names_pass,
@@ -1012,8 +1015,10 @@ def setup(data,
     if verbose:
         if html_param:
             clear_output()
+            print('Setup Successfully Completed!')
             display(functions_)
         else:
+            print('Setup Successfully Completed!')
             print(functions_.data)
 
     if profile:
@@ -1163,24 +1168,31 @@ def create_model(model = None,
     Parameters
     ----------
-    model : string, default = None
-
-    Enter abbreviated string of the model class. List of available models supported:
-
-    Model                              Abbreviated String   Original Implementation
-    ---------                          ------------------   -----------------------
-    K-Means Clustering                 'kmeans'             sklearn.cluster.KMeans.html
-    Affinity Propagation               'ap'                 AffinityPropagation.html
-    Mean shift Clustering              'meanshift'          sklearn.cluster.MeanShift.html
-    Spectral Clustering                'sc'                 SpectralClustering.html
-    Agglomerative Clustering           'hclust'             AgglomerativeClustering.html
-    Density-Based Spatial Clustering   'dbscan'             sklearn.cluster.DBSCAN.html
-    OPTICS Clustering                  'optics'             sklearn.cluster.OPTICS.html
-    Birch Clustering                   'birch'              sklearn.cluster.Birch.html
-    K-Modes Clustering                 'kmodes'             git/nicodv/kmodes
+    model : string / object, default = None
+
+    Enter ID of the models available in model library or pass an untrained model
+    object consistent with fit / predict API to train and evaluate model. List of
+    models available in model library:
+
+    ID            Name
+    ------        -----------
+    'kmeans'      K-Means Clustering
+    'ap'          Affinity Propagation
+    'meanshift'   Mean shift Clustering
+    'sc'          Spectral Clustering
+    'hclust'      Agglomerative Clustering
+    'dbscan'      Density-Based Spatial Clustering
+    'optics'      OPTICS Clustering
+    'birch'       Birch Clustering
+    'kmodes'      K-Modes Clustering
 
     num_clusters: int, default = None
-    Number of clusters to be generated with the dataset. If None, num_clusters is set to 4.
+    Number of clusters to be generated with the dataset. If None, num_clusters
+    is set to 4.
+
+    ground_truth: string, default = None
+    When ground_truth is provided, Homogeneity Score, Rand Index, and
+    Completeness Score are evaluated and printed along with other metrics.
 
     verbose: Boolean, default = True
     Status update is not printed when verbose is set to False.
 
@@ -1194,7 +1206,12 @@ def create_model(model = None,
     Returns:
     --------
 
-    model:    trained model object
+    score grid:   A table containing the Silhouette, Calinski-Harabasz,
+    -----------   Davies-Bouldin, Homogeneity Score, Rand Index, and
+                  Completeness Score. Last 3 are only evaluated when
+                  ground_truth param is provided.
+
+    model:    trained model object
     ------
 
     Warnings:
@@ -1242,18 +1259,19 @@ def create_model(model = None,
     #checking for allowed models
     allowed_models = ['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics', 'birch', 'kmodes']
 
+    if type(model) is str:
+        if model not in allowed_models:
+            sys.exit('(Value Error): Model Not Available. Please see docstring for list of available models.')
+
     #check num_clusters parameter:
     if num_clusters is not None:
         no_num_required = ['ap', 'meanshift', 'dbscan', 'optics']
         if model in no_num_required:
             sys.exit('(Value Error): num_clusters parameter not required for specified model. Remove num_clusters to run this model.')
-
-    if model not in allowed_models:
-        sys.exit('(Value Error): Model Not Available. Please see docstring for list of available models.')
 
     #checking num_clusters type:
     if num_clusters is not None:
-        if type(num_clusters) is not int:
+        if num_clusters <= 1:
             sys.exit('(Type Error): num_clusters parameter can only take value integer value greater than 1.')
 
     #check ground truth exist in data_
@@ -1355,6 +1373,13 @@ def create_model(model = None,
         from kmodes.kmodes import KModes
         model = KModes(n_clusters=num_clusters, n_jobs=n_jobs_param, random_state=seed, **kwargs)
         full_name = 'K-Modes Clustering'
+
+    else:
+        def get_model_name(e):
+            return str(e).split("(")[0]
+
+        model = model
+        full_name = get_model_name(model)
 
     logger.info(str(full_name) + ' Imported succesfully')
 
@@ -1383,7 +1408,7 @@
     try:
         silhouette = metrics.silhouette_score(X,model.labels_)
-        silhouette = silhouette.round(4)
+        silhouette = round(silhouette, 4)
         metric.append('Silhouette')
         metric_value.append(silhouette)
 
@@ -1392,7 +1417,7 @@
     try:
         chs = metrics.calinski_harabasz_score(X,model.labels_)
-        chs = chs.round(4)
+        chs = round(chs, 4)
         metric.append('Calinski-Harabasz')
         metric_value.append(chs)
     except:
@@ -1400,7 +1425,7 @@
     try:
         db = metrics.davies_bouldin_score(X,model.labels_)
-        db = db.round(4)
+        db = round(db, 4)
         metric.append('Davies-Bouldin')
         metric_value.append(db)
 
@@ -1415,7 +1440,7 @@
     try:
         hs = metrics.homogeneity_score(gt,model.labels_)
-        hs = hs.round(4)
+        hs = round(hs, 4)
         metric.append('Homogeneity Score')
         metric_value.append(hs)
 
@@ -1424,15 +1449,16 @@
     try:
         ari = metrics.adjusted_rand_score(gt,model.labels_)
-        ari = ari.round(4)
-        metric.append('ARI')
+        ari = round(ari,4)
+        metric.append('Rand Index')
         metric_value.append(ari)
+
     except:
         pass
-    
+
     try:
         cs = metrics.completeness_score(gt,model.labels_)
-        cs = cs.round(4)
+        cs = round(cs, 4)
         metric.append('Completeness Score')
         metric_value.append(cs)
     except:
@@ -1591,10 +1617,6 @@
     dataframe:   Returns a dataframe with assigned clusters using a trained model.
     ---------
 
-    Warnings:
-    ---------
-    None
-
     """
 
@@ -1731,6 +1753,7 @@ def tune_model(model=None,
                supervised_target=None,
                estimator=None,
                optimize=None,
+               custom_grid = None, #added in pycaret 2.0.0
                fold=10,
                verbose=True): #added in pycaret 2.0.0
 
@@ -1826,6 +1849,11 @@ def tune_model(model=None,
 
     optimize: string, default = None
 
+    custom_grid: list, default = None
+    By default, a pre-defined number of clusters is iterated over to
+    optimize the supervised objective. To overwrite default iteration,
+    pass a list of num_clusters to iterate over in custom_grid param.
+
     For Classification tasks:
     Accuracy, AUC, Recall, Precision, F1, Kappa
@@ -1934,7 +1962,10 @@ def tune_model(model=None,
     import datetime, time
 
     #progress bar
-    max_steps = 25
+    if custom_grid is None:
+        max_steps = 25
+    else:
+        max_steps = 15 + len(custom_grid)
 
     progress = ipw.IntProgress(value=0, min=0, max=max_steps, step=1 , description='Processing: ')
 
@@ -2036,9 +2067,20 @@ def tune_model(model=None,
     progress.value += 1
 
     #defining tuning grid
-    param_grid_with_zero = [0, 4, 5, 6, 8, 10, 14, 18, 25, 30, 40]
-    param_grid = [4, 5, 6, 8, 10, 14, 18, 25, 30, 40]
-
+    if custom_grid is not None:
+
+        param_grid = custom_grid
+        param_grid_with_zero = [0]
+
+        for i in param_grid:
+            param_grid_with_zero.append(i)
+
+    else:
+
+        param_grid = [4, 5, 6, 8, 10, 14, 18, 25, 30, 40]
+        param_grid_with_zero = [0, 4, 5, 6, 8, 10, 14, 18, 25, 30, 40]
+
+
     master = []; master_df = []
 
     monitor.iloc[1,1:] = 'Creating Clustering Model'
@@ -2236,13 +2278,13 @@ def tune_model(model=None,
                                     rare_level_threshold = combine_rare_threshold_pass,
                                     bin_numeric_features = features_to_bin_pass,
                                     remove_multicollinearity = remove_multicollinearity_pass,
-                                    multicollinearity_threshold = multicollinearity_threshold_pass,
+                                    multicollinearity_threshold = multicollinearity_threshold_pass, 
                                     group_features = group_features_pass,
                                     group_names = group_names_pass,
                                     supervised = True,
                                     supervised_target = supervised_target,
                                     session_id = seed,
-                                    logging = False, #added in pycaret==2.0.0
+                                    log_experiment = False, #added in pycaret==2.0.0
                                     profile=False,
                                     verbose=False)
 
diff --git a/pycaret/nlp.py b/pycaret/nlp.py
index edc07e3..c970c73 100644
--- a/pycaret/nlp.py
+++ b/pycaret/nlp.py
@@ -2,7 +2,7 @@
 # Author: Moez Ali
 # License: MIT
 # Release: PyCaret 2.0x
-# Last modified : 09/07/2020
+# Last modified : 20/07/2020
 
 def setup(data,
           target=None,
@@ -811,7 +811,7 @@ def create_model(model=None,
 
     #checking round parameter
     if num_topics is not None:
-        if type(num_topics) is not int:
+        if num_topics <= 1:
             sys.exit('(Type Error): num_topics parameter only accepts integer value.')
 
     #checking verbose parameter
@@ -2002,6 +2002,7 @@ def tune_model(model=None,
                supervised_target=None,
                estimator=None,
                optimize=None,
+               custom_grid = None, #added in pycaret 2.0.0
                auto_fe = True,
                fold=10,
                verbose=True): #added in pycaret==2.0.0
@@ -2103,6 +2104,11 @@ def tune_model(model=None,
 
     optimize: string, default = None
 
+    custom_grid: list, default = None
+    By default, a pre-defined number of topics is iterated over to
+    optimize the supervised objective. To overwrite default iteration,
+    pass a list of num_topics to iterate over in custom_grid param.
+
     For Classification tasks:
     Accuracy, AUC, Recall, Precision, F1, Kappa
@@ -2230,7 +2236,11 @@ def tune_model(model=None,
     import datetime, time
 
     #progress bar
-    max_steps = 25
+    if custom_grid is None:
+        max_steps = 25
+    else:
+        max_steps = 10 + len(custom_grid)
+
     progress = ipw.IntProgress(value=0, min=0, max=max_steps, step=1 , description='Processing: ')
     if verbose:
         if html_param:
@@ -2343,7 +2353,12 @@ def tune_model(model=None,
     progress.value += 1
 
     #defining tuning grid
-    param_grid = [2,4,8,16,32,64,100,200,300,400]
+
+    if custom_grid is not None:
+        param_grid = custom_grid
+
+    else:
+        param_grid = [2,4,8,16,32,64,100,200,300,400]
 
     master = []; master_df = []
-- 
GitLab
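
Usage sketch for the custom_grid parameter introduced in tune_model above. This is a minimal, hedged example assuming a PyCaret 2.x nightly install; the 'boston' dataset and the 'medv' supervised target are illustrative choices, not part of the patch.

# Hypothetical walk-through of the new custom_grid argument (anomaly module).
from pycaret.datasets import get_data
from pycaret.anomaly import setup, tune_model

boston = get_data('boston')                      # assumed example dataset
exp = setup(data=boston, normalize=True, session_id=123)

# Only these contamination fractions are evaluated instead of the
# default 0.01-0.10 grid defined inside tune_model:
tuned_knn = tune_model(model='knn',
                       supervised_target='medv',  # assumed regression target column
                       custom_grid=[0.01, 0.05, 0.10],
                       fold=5)

The clustering and nlp modules follow the same pattern, with the list holding candidate num_clusters or num_topics values instead of fractions.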
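
The new else branch in create_model (the get_model_name fall-through) appears to let an untrained estimator object be passed instead of a library ID. A sketch under the assumption that a scikit-learn style clusterer exposing labels_ after fitting works here; the dataset name is illustrative.

from pycaret.datasets import get_data
from pycaret.clustering import setup, create_model
from sklearn.cluster import KMeans

data = get_data('jewellery')                     # assumed example dataset
exp = setup(data, session_id=123)

# The estimator is configured directly on the object; PyCaret fits it and
# reports Silhouette / Calinski-Harabasz / Davies-Bouldin from its labels_.
custom_kmeans = create_model(KMeans(n_clusters=6, random_state=123))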
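
The ground_truth option documented in the clustering create_model docstring is expected to take the name of a label column present in the data passed to setup, against which Homogeneity Score, Rand Index, and Completeness Score are reported. A sketch with an assumed dataset and column name.

from pycaret.datasets import get_data
from pycaret.clustering import setup, create_model

iris = get_data('iris')                          # assumed dataset with a label column
exp = setup(iris, ignore_features=['species'], session_id=123)

# 'species' stays in the raw data, so it can serve as ground truth while
# being excluded from the features used to form the clusters.
kmeans = create_model('kmeans', num_clusters=3, ground_truth='species')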