Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenDocCN
pycaret
提交
29be74fd
pycaret
项目概览
OpenDocCN
/
pycaret
通知
2
Star
2
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
pycaret
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
29be74fd
编写于
7月 20, 2020
作者:
P
PyCaret
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
updated pycaret-nightly==0.28 part 1/3
上级
f4da92cd
变更
4
隐藏空白更改
内联
并排
Showing
4 changed files
with
155 additions
and
58 deletions
+155
-58
Dockerfile
Dockerfile
+11
-0
pycaret/anomaly.py
pycaret/anomaly.py
+35
-6
pycaret/clustering.py
pycaret/clustering.py
+90
-48
pycaret/nlp.py
pycaret/nlp.py
+19
-4
未找到文件。
Dockerfile
0 → 100644
浏览文件 @
29be74fd
FROM
python:3.7-slim
WORKDIR
/app
ADD
. /app
RUN
apt-get update
&&
apt-get
install
-y
libgomp1
RUN
pip
install
--trusted-host
pypi.python.org
-r
requirements.txt
CMD
pytest
\ No newline at end of file
pycaret/anomaly.py
浏览文件 @
29be74fd
...
...
@@ -2,7 +2,7 @@
# Author: Moez Ali <moez.ali@queensu.ca>
# License: MIT
# Release: PyCaret 2.0x
# Last modified :
14
/07/2020
# Last modified :
20
/07/2020
def
setup
(
data
,
categorical_features
=
None
,
...
...
@@ -1016,6 +1016,7 @@ def setup(data,
if
verbose
:
if
html_param
:
clear_output
()
print
(
'Setup Succesfully Completed!'
)
display
(
functions_
)
else
:
print
(
functions_
.
data
)
...
...
@@ -1239,7 +1240,7 @@ def create_model(model = None,
sys
.
exit
(
'(Value Error): Model Not Available. Please see docstring for list of available models.'
)
#checking fraction type:
if
type
(
fraction
)
is
not
float
:
if
fraction
<=
0
or
fraction
>=
1
:
sys
.
exit
(
'(Type Error): Fraction parameter can only take value as float between 0 to 1.'
)
#checking verbose parameter
...
...
@@ -1362,6 +1363,13 @@ def create_model(model = None,
model
=
SOS
(
contamination
=
fraction
,
**
kwargs
)
full_name
=
'Stochastic Outlier Selection'
else
:
def
get_model_name
(
e
):
return
str
(
e
).
split
(
"("
)[
0
]
model
==
model
full_name
=
get_model_name
(
model
)
logger
.
info
(
str
(
full_name
)
+
' Imported succesfully'
)
#monitor update
...
...
@@ -1665,6 +1673,7 @@ def tune_model(model=None,
method
=
'drop'
,
estimator
=
None
,
optimize
=
None
,
custom_grid
=
None
,
#added in pycaret 2.0.0
fold
=
10
,
verbose
=
True
):
#added in pycaret 2.0.0
...
...
@@ -1773,6 +1782,11 @@ def tune_model(model=None,
optimize: string, default = None
custom_grid: list, default = None
By default, a pre-defined list of fraction values is iterated over to
optimize the supervised objective. To overwrite default iteration,
pass a list of fraction values to iterate over in custom_grid param.
For Classification tasks:
Accuracy, AUC, Recall, Precision, F1, Kappa
...
...
@@ -1884,7 +1898,11 @@ def tune_model(model=None,
import
datetime
,
time
#progress bar
max_steps
=
25
if
custom_grid
is
None
:
max_steps
=
25
else
:
max_steps
=
15
+
len
(
custom_grid
)
progress
=
ipw
.
IntProgress
(
value
=
0
,
min
=
0
,
max
=
max_steps
,
step
=
1
,
description
=
'Processing: '
)
if
verbose
:
...
...
@@ -1994,8 +2012,19 @@ def tune_model(model=None,
progress
.
value
+=
1
#defining tuning grid
param_grid_with_zero
=
[
0
,
0.01
,
0.02
,
0.03
,
0.04
,
0.05
,
0.06
,
0.07
,
0.08
,
0.09
,
0.10
]
param_grid
=
[
0.01
,
0.02
,
0.03
,
0.04
,
0.05
,
0.06
,
0.07
,
0.08
,
0.09
,
0.10
]
if
custom_grid
is
not
None
:
param_grid
=
custom_grid
param_grid_with_zero
=
[
0
]
for
i
in
param_grid
:
param_grid_with_zero
.
append
(
i
)
else
:
param_grid_with_zero
=
[
0
,
0.01
,
0.02
,
0.03
,
0.04
,
0.05
,
0.06
,
0.07
,
0.08
,
0.09
,
0.10
]
param_grid
=
[
0.01
,
0.02
,
0.03
,
0.04
,
0.05
,
0.06
,
0.07
,
0.08
,
0.09
,
0.10
]
master
=
[];
master_df
=
[]
...
...
@@ -2199,7 +2228,7 @@ def tune_model(model=None,
supervised
=
True
,
supervised_target
=
supervised_target
,
session_id
=
seed
,
log
ging
=
False
,
#added in pycaret==2.0.0
log
_experiment
=
False
,
#added in pycaret==2.0.0
profile
=
False
,
verbose
=
False
)
...
...
pycaret/clustering.py
浏览文件 @
29be74fd
...
...
@@ -2,7 +2,7 @@
# Author: Moez Ali <moez.ali@queensu.ca>
# License: MIT
# Release: PyCaret 2.0x
# Last modified :
14
/07/2020
# Last modified :
20
/07/2020
def
setup
(
data
,
categorical_features
=
None
,
...
...
@@ -27,7 +27,7 @@ def setup(data,
rare_level_threshold
=
0.10
,
bin_numeric_features
=
None
,
remove_multicollinearity
=
False
,
multicollinearity_threshold
=
0.9
,
multicollinearity_threshold
=
0.9
,
group_features
=
None
,
group_names
=
None
,
supervised
=
False
,
...
...
@@ -40,8 +40,8 @@ def setup(data,
log_plots
=
False
,
#added in pycaret==2.0.0
log_profile
=
False
,
#added in pycaret==2.0.0
log_data
=
False
,
#added in pycaret==2.0.0
silent
=
False
,
#added in pycaret==2.0.0
verbose
=
True
,
silent
=
False
,
#added in pycaret==2.0.0
verbose
=
True
,
profile
=
False
,):
"""
...
...
@@ -252,13 +252,13 @@ def setup(data,
unique number is then distributed as a seed in all functions used during the
experiment. This can be used for later reproducibility of the entire experiment.
experiment_name: str, default = None
Name of experiment for logging. When set to None, 'clf' is by default used as
alias for the experiment name.
log_experiment: bool, default = True
When set to True, all metrics and parameters are logged on MLFlow server.
experiment_name: str, default = None
Name of experiment for logging. When set to None, 'clu' is by default used as
alias for the experiment name.
log_plots: bool, default = False
When set to True, specific plots are logged in MLflow as a png file. By default,
it is set to False.
...
...
@@ -267,6 +267,9 @@ def setup(data,
When set to True, data profile is also logged on MLflow as a html file. By default,
it is set to False.
log_data: bool, default = False
When set to True, train and test dataset are logged as csv.
silent: bool, default = False
When set to True, confirmation of data types is not required. All preprocessing will
be performed assuming automatically inferred data types. Not recommended for direct use
...
...
@@ -826,7 +829,7 @@ def setup(data,
apply_binning
=
apply_binning_pass
,
features_to_binn
=
features_to_bin_pass
,
remove_multicollinearity
=
remove_multicollinearity
,
maximum_correlation_between_features
=
multicollinearity_threshold
,
maximum_correlation_between_features
=
multicollinearity_threshold
,
apply_grouping
=
apply_grouping_pass
,
features_to_group_ListofList
=
group_features_pass
,
group_name
=
group_names_pass
,
...
...
@@ -1012,8 +1015,10 @@ def setup(data,
if
verbose
:
if
html_param
:
clear_output
()
print
(
'Setup Succesfully Completed!'
)
display
(
functions_
)
else
:
print
(
'Setup Succesfully Completed!'
)
print
(
functions_
.
data
)
if
profile
:
...
...
@@ -1163,24 +1168,31 @@ def create_model(model = None,
Parameters
----------
model : string, default = None
Enter abbreviated string of the model class. List of available models supported:
Model Abbreviated String Original Implementation
--------- ------------------ -----------------------
K-Means Clustering 'kmeans' sklearn.cluster.KMeans.html
Affinity Propagation 'ap' AffinityPropagation.html
Mean shift Clustering 'meanshift' sklearn.cluster.MeanShift.html
Spectral Clustering 'sc' SpectralClustering.html
Agglomerative Clustering 'hclust' AgglomerativeClustering.html
Density-Based Spatial Clustering 'dbscan' sklearn.cluster.DBSCAN.html
OPTICS Clustering 'optics' sklearn.cluster.OPTICS.html
Birch Clustering 'birch' sklearn.cluster.Birch.html
K-Modes Clustering 'kmodes' git/nicodv/kmodes
model : string / object, default = None
Enter ID of the models available in model library or pass an untrained model
object consistent with fit / predict API to train and evaluate model. List of
models available in model library:
ID Name
------ -----------
'kmeans' K-Means Clustering
'ap' Affinity Propagation
'meanshift' Mean shift Clustering
'sc' Spectral Clustering
'hclust' Agglomerative Clustering
'dbscan' Density-Based Spatial Clustering
'optics' OPTICS Clustering
'birch' Birch Clustering
'kmodes' K-Modes Clustering
num_clusters: int, default = None
Number of clusters to be generated with the dataset. If None, num_clusters is set to 4.
Number of clusters to be generated with the dataset. If None, num_clusters
is set to 4.
ground_truth: string, default = None
When ground_truth is provided, Homogeneity Score, Rand Index, and
Completeness Score is evaluated and printed along with other metrics.
verbose: Boolean, default = True
Status update is not printed when verbose is set to False.
...
...
@@ -1194,7 +1206,12 @@ def create_model(model = None,
Returns:
--------
model: trained model object
score grid: A table containing the Silhouette, Calinski-Harabasz,
----------- Davies-Bouldin, Homogeneity Score, Rand Index, and
Completeness Score. Last 3 are only evaluated when
ground_truth param is provided.
model: trained model object
------
Warnings:
...
...
@@ -1242,18 +1259,19 @@ def create_model(model = None,
#checking for allowed models
allowed_models
=
[
'kmeans'
,
'ap'
,
'meanshift'
,
'sc'
,
'hclust'
,
'dbscan'
,
'optics'
,
'birch'
,
'kmodes'
]
if
type
(
model
)
is
str
:
if
model
not
in
allowed_models
:
sys
.
exit
(
'(Value Error): Model Not Available. Please see docstring for list of available models.'
)
#check num_clusters parameter:
if
num_clusters
is
not
None
:
no_num_required
=
[
'ap'
,
'meanshift'
,
'dbscan'
,
'optics'
]
if
model
in
no_num_required
:
sys
.
exit
(
'(Value Error): num_clusters parameter not required for specified model. Remove num_clusters to run this model.'
)
if
model
not
in
allowed_models
:
sys
.
exit
(
'(Value Error): Model Not Available. Please see docstring for list of available models.'
)
#checking num_clusters type:
if
num_clusters
is
not
None
:
if
type
(
num_clusters
)
is
not
int
:
if
num_clusters
<=
1
:
sys
.
exit
(
'(Type Error): num_clusters parameter can only take value integer value greater than 1.'
)
#check ground truth exist in data_
...
...
@@ -1355,6 +1373,13 @@ def create_model(model = None,
from
kmodes.kmodes
import
KModes
model
=
KModes
(
n_clusters
=
num_clusters
,
n_jobs
=
n_jobs_param
,
random_state
=
seed
,
**
kwargs
)
full_name
=
'K-Modes Clustering'
else
:
def
get_model_name
(
e
):
return
str
(
e
).
split
(
"("
)[
0
]
model
==
model
full_name
=
get_model_name
(
model
)
logger
.
info
(
str
(
full_name
)
+
' Imported succesfully'
)
...
...
@@ -1383,7 +1408,7 @@ def create_model(model = None,
try
:
silhouette
=
metrics
.
silhouette_score
(
X
,
model
.
labels_
)
silhouette
=
silhouette
.
round
(
4
)
silhouette
=
round
(
silhouette
,
4
)
metric
.
append
(
'Silhouette'
)
metric_value
.
append
(
silhouette
)
...
...
@@ -1392,7 +1417,7 @@ def create_model(model = None,
try
:
chs
=
metrics
.
calinski_harabasz_score
(
X
,
model
.
labels_
)
chs
=
chs
.
round
(
4
)
chs
=
round
(
chs
,
4
)
metric
.
append
(
'Calinski-Harabasz'
)
metric_value
.
append
(
chs
)
except
:
...
...
@@ -1400,7 +1425,7 @@ def create_model(model = None,
try
:
db
=
metrics
.
davies_bouldin_score
(
X
,
model
.
labels_
)
db
=
db
.
round
(
4
)
db
=
round
(
db
,
4
)
metric
.
append
(
'Davies-Bouldin'
)
metric_value
.
append
(
db
)
...
...
@@ -1415,7 +1440,7 @@ def create_model(model = None,
try
:
hs
=
metrics
.
homogeneity_score
(
gt
,
model
.
labels_
)
hs
=
hs
.
round
(
4
)
hs
=
round
(
hs
,
4
)
metric
.
append
(
'Homogeneity Score'
)
metric_value
.
append
(
hs
)
...
...
@@ -1424,15 +1449,16 @@ def create_model(model = None,
try
:
ari
=
metrics
.
adjusted_rand_score
(
gt
,
model
.
labels_
)
ari
=
ari
.
round
(
4
)
metric
.
append
(
'
ARI
'
)
ari
=
round
(
ari
,
4
)
metric
.
append
(
'
Rand Index
'
)
metric_value
.
append
(
ari
)
except
:
pass
try
:
cs
=
metrics
.
completeness_score
(
gt
,
model
.
labels_
)
cs
=
cs
.
round
(
4
)
cs
=
round
(
cs
,
4
)
metric
.
append
(
'Completeness Score'
)
metric_value
.
append
(
cs
)
except
:
...
...
@@ -1591,10 +1617,6 @@ def assign_model(model,
dataframe: Returns a dataframe with assigned clusters using a trained model.
---------
Warnings:
---------
None
"""
...
...
@@ -1731,6 +1753,7 @@ def tune_model(model=None,
supervised_target
=
None
,
estimator
=
None
,
optimize
=
None
,
custom_grid
=
None
,
#added in pycaret 2.0.0
fold
=
10
,
verbose
=
True
):
#added in pycaret 2.0.0
...
...
@@ -1826,6 +1849,11 @@ def tune_model(model=None,
optimize: string, default = None
custom_grid: list, default = None
By default, a pre-defined number of clusters is iterated over to
optimize the supervised objective. To overwrite default iteration,
pass a list of num_clusters to iterate over in custom_grid param.
For Classification tasks:
Accuracy, AUC, Recall, Precision, F1, Kappa
...
...
@@ -1934,7 +1962,10 @@ def tune_model(model=None,
import
datetime
,
time
#progress bar
max_steps
=
25
if
custom_grid
is
None
:
max_steps
=
25
else
:
max_steps
=
15
+
len
(
custom_grid
)
progress
=
ipw
.
IntProgress
(
value
=
0
,
min
=
0
,
max
=
max_steps
,
step
=
1
,
description
=
'Processing: '
)
...
...
@@ -2036,9 +2067,20 @@ def tune_model(model=None,
progress
.
value
+=
1
#defining tuning grid
param_grid_with_zero
=
[
0
,
4
,
5
,
6
,
8
,
10
,
14
,
18
,
25
,
30
,
40
]
param_grid
=
[
4
,
5
,
6
,
8
,
10
,
14
,
18
,
25
,
30
,
40
]
if
custom_grid
is
not
None
:
param_grid
=
custom_grid
param_grid_with_zero
=
[
0
]
for
i
in
param_grid
:
param_grid_with_zero
.
append
(
i
)
else
:
param_grid
=
[
4
,
5
,
6
,
8
,
10
,
14
,
18
,
25
,
30
,
40
]
param_grid_with_zero
=
[
0
,
4
,
5
,
6
,
8
,
10
,
14
,
18
,
25
,
30
,
40
]
master
=
[];
master_df
=
[]
monitor
.
iloc
[
1
,
1
:]
=
'Creating Clustering Model'
...
...
@@ -2236,13 +2278,13 @@ def tune_model(model=None,
rare_level_threshold
=
combine_rare_threshold_pass
,
bin_numeric_features
=
features_to_bin_pass
,
remove_multicollinearity
=
remove_multicollinearity_pass
,
multicollinearity_threshold
=
multicollinearity_threshold_pass
,
multicollinearity_threshold
=
multicollinearity_threshold_pass
,
group_features
=
group_features_pass
,
group_names
=
group_names_pass
,
supervised
=
True
,
supervised_target
=
supervised_target
,
session_id
=
seed
,
log
ging
=
False
,
#added in pycaret==2.0.0
log
_experiment
=
False
,
#added in pycaret==2.0.0
profile
=
False
,
verbose
=
False
)
...
...
pycaret/nlp.py
浏览文件 @
29be74fd
...
...
@@ -2,7 +2,7 @@
# Author: Moez Ali <moez.ali@queensu.ca>
# License: MIT
# Release: PyCaret 2.0x
# Last modified :
09
/07/2020
# Last modified :
20
/07/2020
def
setup
(
data
,
target
=
None
,
...
...
@@ -811,7 +811,7 @@ def create_model(model=None,
#checking round parameter
if
num_topics
is
not
None
:
if
type
(
num_topics
)
is
not
int
:
if
num_topics
<=
1
:
sys
.
exit
(
'(Type Error): num_topics parameter only accepts integer value.'
)
#checking verbose parameter
...
...
@@ -2002,6 +2002,7 @@ def tune_model(model=None,
supervised_target
=
None
,
estimator
=
None
,
optimize
=
None
,
custom_grid
=
None
,
#added in pycaret 2.0.0
auto_fe
=
True
,
fold
=
10
,
verbose
=
True
):
#added in pycaret==2.0.0
...
...
@@ -2103,6 +2104,11 @@ def tune_model(model=None,
optimize: string, default = None
custom_grid: list, default = None
By default, a pre-defined number of topics is iterated over to
optimize the supervised objective. To overwrite default iteration,
pass a list of num_topics to iterate over in custom_grid param.
For Classification tasks:
Accuracy, AUC, Recall, Precision, F1, Kappa
...
...
@@ -2230,7 +2236,11 @@ def tune_model(model=None,
import
datetime
,
time
#progress bar
max_steps
=
25
if
custom_grid
is
None
:
max_steps
=
25
else
:
max_steps
=
10
+
len
(
custom_grid
)
progress
=
ipw
.
IntProgress
(
value
=
0
,
min
=
0
,
max
=
max_steps
,
step
=
1
,
description
=
'Processing: '
)
if
verbose
:
if
html_param
:
...
...
@@ -2343,7 +2353,12 @@ def tune_model(model=None,
progress
.
value
+=
1
#defining tuning grid
param_grid
=
[
2
,
4
,
8
,
16
,
32
,
64
,
100
,
200
,
300
,
400
]
if
custom_grid
is
not
None
:
param_grid
=
custom_grid
else
:
param_grid
=
[
2
,
4
,
8
,
16
,
32
,
64
,
100
,
200
,
300
,
400
]
master
=
[];
master_df
=
[]
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录