Add files via upload

de62af2c · PyCaret · GitHub · 136558c8 · de62af2c · de62af2c
展开全部隐藏空白更改
内联并排

Showing with 740 additions and 189 deletions

anomaly.py anomaly.py +27 -7

classification.py classification.py +116 -11

clustering.py clustering.py +56 -11

regression.py regression.py +541 -160

未找到文件。
--- a/anomaly.py
+++ b/anomaly.py
@@ -2095,7 +2095,8 @@ def tune_model(model=None,


 def plot_model(model,
-               plot = 'tsne'):
+               plot = 'tsne',
+               feature = None):
    
    
    """
@@ -2129,6 +2130,9 @@ def plot_model(model,
    t-SNE (3d) Dimension Plot      'tsne'
    UMAP Dimensionality Plot       'umap'

+    feature : string, default = None
+    Name of the column used as the hover tooltip. When no feature is passed, the
+    first column of the dataset is used as the hover tooltip.
    
    Returns:
    --------
@@ -2190,11 +2194,19 @@ def plot_model(model,

        X = pd.DataFrame(X_embedded)
        X['Label'] = Label
+        
+        if feature is not None: 
+            X['Feature'] = data_[feature]
+        else:
+            X['Feature'] = data_[data_.columns[0]]

        import plotly.express as px
        df = X
-        fig = px.scatter_3d(df, x=0, y=1, z=2,
-                      color='Label', title='3d TSNE Plot for Outliers', opacity=0.7, width=900, height=800)
+            
+        fig = px.scatter_3d(df, x=0, y=1, z=2, hover_data=['Feature'], color='Label', title='3d TSNE Plot for Outliers', 
+                                opacity=0.7, width=900, height=800)
+            
+            
        fig.show()
        
    elif plot == 'umap':
@@ -2213,8 +2225,15 @@ def plot_model(model,
        import plotly.express as px
        df = X
        df['Label'] = Label
+        
+        if feature is not None: 
+            df['Feature'] = data_[feature]
+        else:
+            df['Feature'] = data_[data_.columns[0]]
+            
        fig = px.scatter(df, x=0, y=1,
-                      color='Label', title='uMAP Plot for Outliers', opacity=0.7, width=900, height=800)
+                      color='Label', title='uMAP Plot for Outliers', hover_data=['Feature'], opacity=0.7, 
+                         width=900, height=800)
        fig.show() 


@@ -2577,6 +2596,8 @@ def deploy_model(model,
       
    Description:
    ------------
+    (In Preview)
+
    This function deploys the transformation pipeline and trained model object for
    production use. The platform of deployment can be defined under the platform
    param along with the applicable authentication tokens which are passed as a
@@ -2704,7 +2725,7 @@ def get_outliers(data,
                                       Power_transform_data = transformation,
                                       Power_transform_method = 'yj',
                                       apply_pca = pca,
-                                       pca_variance_retained=pca_components,
+                                       pca_variance_retained_or_number_of_components=pca_components,
                                       random_state = seed)
    
    
@@ -2713,5 +2734,4 @@ def get_outliers(data,
    
    dataset = assign_model(c, verbose=False)
    
-    return dataset
-
+    return dataset
\ No newline at end of file
--- a/classification.py
+++ b/classification.py
@@ -25,6 +25,12 @@ def setup(data,
          combine_rare_levels = False, #new
          rare_level_threshold = 0.10, #new
          bin_numeric_features = None, #new
+          remove_outliers = False, #new
+          outliers_threshold = 0.05, #new
+          remove_multicollinearity = False, #new
+          multicollinearity_threshold = 0.9, #new
+          create_clusters = False, #new
+          cluster_iter = 20, #new
          session_id = None,
          profile = False):
    
@@ -163,8 +169,6 @@ def setup(data,
    incremental : replacement for 'linear' pca when the dataset to be decomposed is 
                  too large to fit in memory
    
-    pls         : dimensionality reduction through supervised PLSregression technique.
-    
    pca_components: int/float, default = 0.99
    Number of components to keep. if pca_components is a float, it is treated as 
    goal percentage for information retention. When pca_components param is integer
@@ -195,6 +199,36 @@ def setup(data,
    It is only optimal for gaussian data and underestimates number of bins for large 
    non-gaussian datasets.
    
+    remove_outliers: bool, default = False
+    When set to True, outliers are removed from the training data using an ensemble of
+    Isolation Forest, K Nearest Neighbour and PCA Outlier detectors. All of them are
+    unsupervised techniques. The contamination percentage is defined using the
+    outliers_threshold parameter.
+    
+    outliers_threshold: float, default = 0.05
+    The percentage / proportion of outliers in the dataset can be defined using
+    outliers_threshold param. By default, 0.05 is used, which means 0.025 of the values
+    on each tail of the distribution are dropped from the training data.
+    
+    remove_multicollinearity: bool, default = False
+    When set to True, it drops the variables with inter-correlations higher than
+    the threshold defined under multicollinearity_threshold param. When two features
+    are highly correlated with each other, feature with less correlation with target
+    variable is dropped.
+    
+    multicollinearity_threshold: float, default = 0.9
+    Threshold used for dropping the correlated features. Only comes into effect when 
+    remove_multicollinearity is set to True.
+    
+    create_clusters: bool, default = False
+    When set to True, an additional feature is created where each instance is assigned
+    to a cluster. The number of clusters is determined using a combination of the
+    Calinski-Harabasz and Silhouette criteria.
+    
+    cluster_iter: int, default = 20
+    Number of iterations used to create a cluster. Each iteration represents cluster size.
+    Only comes into effect when create_clusters param is set to True.
+    
    session_id: int, default = None
    If None, a random seed is generated and returned in the Information grid. The 
    unique number is then distributed as a seed in all functions used during the 
@@ -279,9 +313,9 @@ def setup(data,
        sys.exit('(Type Error): PCA parameter only accepts True or False.')
        
    #pca method check
-    allowed_pca_methods = ['linear', 'kernel', 'incremental', 'pls']
+    allowed_pca_methods = ['linear', 'kernel', 'incremental']
    if pca_method not in allowed_pca_methods:
-        sys.exit("(Value Error): pca method param only accepts 'linear', 'kernel', 'incremental', or 'pls'. ")    
+        sys.exit("(Value Error): pca method param only accepts 'linear', 'kernel', or 'incremental'. ")    
    
    #pca components check
    if pca is True:
@@ -326,6 +360,35 @@ def setup(data,
            if i not in all_cols:
                sys.exit("(Value Error): Column type forced is either target column or doesn't exist in the dataset.")

+    #remove_outliers
+    if type(remove_outliers) is not bool:
+        sys.exit('(Type Error): remove_outliers parameter only accepts True or False.')    
+    
+    #outliers_threshold
+    if type(outliers_threshold) is not float:
+        sys.exit('(Type Error): outliers_threshold must be a float between 0 and 1. ')   
+        
+    #remove_multicollinearity
+    if type(remove_multicollinearity) is not bool:
+        sys.exit('(Type Error): remove_multicollinearity parameter only accepts True or False.')
+        
+    #multicollinearity_threshold
+    if type(multicollinearity_threshold) is not float:
+        sys.exit('(Type Error): multicollinearity_threshold must be a float between 0 and 1. ')  
+        
+    #multicollinearity and multiclass check
+    if data[target].value_counts().count() > 2:
+        if remove_multicollinearity is True:
+            sys.exit('(Type Error): remove_multicollinearity cannot be used when target is multiclass. ')  
+    
+    #create_clusters
+    if type(create_clusters) is not bool:
+        sys.exit('(Type Error): create_clusters parameter only accepts True or False.')
+        
+    #cluster_iter
+    if type(cluster_iter) is not int:
+        sys.exit('(Type Error): cluster_iter must be a integer greater than 1. ')                 
+
    #cannot drop target
    if ignore_features is not None:
        if target in ignore_features:
@@ -516,8 +579,14 @@ def setup(data,
                                          rara_level_threshold_percentage = rare_level_threshold, #new
                                          apply_binning = apply_binning_pass, #new
                                          features_to_binn = features_to_bin_pass, #new
+                                          remove_outliers = remove_outliers, #new
+                                          outlier_contamination_percentage = outliers_threshold, #new
+                                          remove_multicollinearity = remove_multicollinearity, #new
+                                          maximum_correlation_between_features = multicollinearity_threshold, #new
+                                          cluster_entire_data = create_clusters, #new
+                                          range_of_clusters_to_try = cluster_iter, #new
                                          display_types = True, #this is for inferred input box
-                                          target_transformation = False, #to be dealt later
+                                          target_transformation = False, #not needed for classification
                                          random_state = seed)

    progress.value += 1
@@ -530,10 +599,10 @@ def setup(data,

    else:
        label_encoded = 'None'
-
-
+    
    res_type = ['quit','Quit','exit','EXIT','q','Q','e','E','QUIT','Exit']
    res = preprocess.dtypes.response
+        
    if res in res_type:
        sys.exit("(Process Exit): setup has been interupted with user command 'quit'. setup must rerun." )
    
@@ -577,6 +646,21 @@ def setup(data,
    else:
        numeric_bin_grid = 'True'
    
+    if remove_outliers is False:
+        outliers_threshold_grid = None
+    else:
+        outliers_threshold_grid = outliers_threshold
+    
+    if remove_multicollinearity is False:
+        multicollinearity_threshold_grid = None
+    else:
+        multicollinearity_threshold_grid = multicollinearity_threshold
+    
+    if create_clusters is False:
+        cluster_iter_grid = None
+    else:
+        cluster_iter_grid = cluster_iter
+        
    learned_types = preprocess.dtypes.learent_dtypes
    learned_types.drop(target, inplace=True)

@@ -812,6 +896,12 @@ def setup(data,
                                         ['Combine Rare Levels ', combine_rare_levels],
                                         ['Rare Level Threshold ', rare_level_threshold_grid],
                                         ['Numeric Binning ', numeric_bin_grid],
+                                         ['Remove Outliers ', remove_outliers],
+                                         ['Outliers Threshold ', outliers_threshold_grid],
+                                         ['Remove Multicollinearity ', remove_multicollinearity],
+                                         ['Multicollinearity Threshold ', multicollinearity_threshold_grid],
+                                         ['Clustering ', create_clusters],
+                                         ['Clustering Iteration ', cluster_iter_grid],
                                         ['Missing Values ', missing_flag],
                                         ['Numeric Imputer ', numeric_imputation],
                                         ['Categorical Imputer ', categorical_imputation],
@@ -834,7 +924,7 @@ def setup(data,
            '''   
            
            #log into experiment
-            experiment__.append(('Info', functions))
+            experiment__.append(('Classification Setup Config', functions))
            experiment__.append(('X_training Set', X_train))
            experiment__.append(('y_training Set', y_train))
            experiment__.append(('X_test Set', X_test))
@@ -886,6 +976,12 @@ def setup(data,
                                         ['Combine Rare Levels ', combine_rare_levels],
                                         ['Rare Level Threshold ', rare_level_threshold_grid],
                                         ['Numeric Binning ', numeric_bin_grid],
+                                         ['Remove Outliers ', remove_outliers],
+                                         ['Outliers Threshold ', outliers_threshold_grid],
+                                         ['Remove Multicollinearity ', remove_multicollinearity],
+                                         ['Multicollinearity Threshold ', multicollinearity_threshold_grid],
+                                         ['Clustering ', create_clusters],
+                                         ['Clustering Iteration ', cluster_iter_grid],
                                         ['Missing Values ', missing_flag],
                                         ['Numeric Imputer ', numeric_imputation],
                                         ['Categorical Imputer ', categorical_imputation],
@@ -908,7 +1004,7 @@ def setup(data,
            ''' 
            
            #log into experiment
-            experiment__.append(('Info', functions))
+            experiment__.append(('Classification Setup Config', functions))
            experiment__.append(('X_training Set', X_train))
            experiment__.append(('y_training Set', y_train))
            experiment__.append(('X_test Set', X_test))
@@ -955,6 +1051,12 @@ def setup(data,
                                     ['Combine Rare Levels ', combine_rare_levels],
                                     ['Rare Level Threshold ', rare_level_threshold_grid],
                                     ['Numeric Binning ', numeric_bin_grid],
+                                     ['Remove Outliers ', remove_outliers],
+                                     ['Outliers Threshold ', outliers_threshold_grid],
+                                     ['Remove Multicollinearity ', remove_multicollinearity],
+                                     ['Multicollinearity Threshold ', multicollinearity_threshold_grid],
+                                     ['Clustering ', create_clusters],
+                                     ['Clustering Iteration ', cluster_iter_grid],
                                     ['Missing Values ', missing_flag],
                                     ['Numeric Imputer ', numeric_imputation],
                                     ['Categorical Imputer ', categorical_imputation],
@@ -977,7 +1079,7 @@ def setup(data,
        '''   
        
        #log into experiment
-        experiment__.append(('Classification Info', functions))
+        experiment__.append(('Classification Setup Config', functions))
        experiment__.append(('X_training Set', X_train))
        experiment__.append(('y_training Set', y_train))
        experiment__.append(('X_test Set', X_test))
@@ -988,6 +1090,7 @@ def setup(data,



+
 def create_model(estimator = None, 
                 ensemble = False, 
                 method = None, 
@@ -3373,7 +3476,7 @@ def tune_model(estimator = None,
        
        from sklearn.tree import DecisionTreeClassifier
        
-        param_grid = {"max_depth": np.random.randint(3, (len(X_train.columns)*.85),4),
+        param_grid = {"max_depth": np.random.randint(1, (len(X_train.columns)*.85),4),
                  "max_features": np.random.randint(3, len(X_train.columns),4),
                  "min_samples_leaf": [2,3,4],
                  "criterion": ["gini", "entropy"]}
@@ -7353,6 +7456,8 @@ def deploy_model(model,
       
    Description:
    ------------
+    (In Preview)
+
    This function deploys the transformation pipeline and trained model object for
    production use. The platform of deployment can be defined under the platform
    param along with the applicable authentication tokens which are passed as a

--- a/clustering.py
+++ b/clustering.py
@@ -2032,7 +2032,7 @@ def tune_model(model=None,
    


-def plot_model(model, plot='cluster', feature=None):
+def plot_model(model, plot='cluster', feature = None, label = False):
    
    
    """
@@ -2072,8 +2072,13 @@ def plot_model(model, plot='cluster', feature=None):
    Distribution Plot              'distribution'
    
    feature : string, default = None
-    Name of feature column for x-axis of distribution plot. It only comes in effect
-    when plot = 'distribution'.
+    Name of the feature column for the x-axis when plot = 'distribution'. When plot is
+    'cluster' or 'tsne', the feature column is used as a hover tooltip and/or as the label
+    when label is set to True. If no feature name is passed for 'cluster' or 'tsne',
+    the first column of the dataset is used as the hover tooltip by default.
+    
+    label : bool, default = False
+    When set to True, data labels are shown in the 'cluster' and 'tsne' plots.
    
    Returns:
    --------
@@ -2087,7 +2092,7 @@ def plot_model(model, plot='cluster', feature=None):
              

    """  
-        
+    
    #exception checking   
    import sys
    
@@ -2099,7 +2104,17 @@ def plot_model(model, plot='cluster', feature=None):
    allowed_plots = ['cluster', 'tsne', 'elbow', 'silhouette', 'distance', 'distribution']  
    if plot not in allowed_plots:
        sys.exit('(Value Error): Plot Not Available. Please see docstring for list of available plots.')
-     
+        
+    if type(label) is not bool:
+        sys.exit('(Type Error): Label param only accepts True or False. ')
+        
+    if feature is not None:
+        if type(feature) is not str:
+            sys.exit('(Type Error): feature parameter must be string containing column name of dataset. ') 
+    
+    
+    
+    
    #specific disallowed plots
    
    """
@@ -2161,7 +2176,18 @@ def plot_model(model, plot='cluster', feature=None):
        pca_ = pca_.rename(columns={0: "PCA1", 1: "PCA2"})
        pca_['Cluster'] = cluster
        
-        fig = px.scatter(pca_, x="PCA1", y="PCA2", color='Cluster', opacity=0.5)
+        if feature is not None: 
+            pca_['Feature'] = data_[feature]
+        else:
+            pca_['Feature'] = data_[data_.columns[0]]
+            
+        if label:
+                pca_['Label'] = pca_['Feature']
+
+        if label:
+            fig = px.scatter(pca_, x="PCA1", y="PCA2", text='Label', color='Cluster', opacity=0.5)
+        else:
+            fig = px.scatter(pca_, x="PCA1", y="PCA2", hover_data=['Feature'], color='Cluster', opacity=0.5)

        fig.update_traces(textposition='top center')
        fig.update_layout(plot_bgcolor='rgb(240,240,240)')
@@ -2210,10 +2236,26 @@ def plot_model(model, plot='cluster', feature=None):
        X_embedded = pd.DataFrame(X_embedded)
        X_embedded['Cluster'] = cluster
        
+        if feature is not None: 
+            X_embedded['Feature'] = data_[feature]
+        else:
+            X_embedded['Feature'] = data_[data_.columns[0]]
+            
+        if label:
+                X_embedded['Label'] = X_embedded['Feature']
+                
        import plotly.express as px
        df = X_embedded
-        fig = px.scatter_3d(df, x=0, y=1, z=2,
-                      color='Cluster', title='3d TSNE Plot for Clusters', opacity=0.7, width=900, height=800)
+        
+        if label:
+            
+            fig = px.scatter_3d(df, x=0, y=1, z=2, color='Cluster', title='3d TSNE Plot for Clusters', 
+                    text = 'Label', opacity=0.7, width=900, height=800)
+            
+        else:
+            fig = px.scatter_3d(df, x=0, y=1, z=2, color='Cluster', title='3d TSNE Plot for Clusters', 
+                                hover_data = ['Feature'], opacity=0.7, width=900, height=800)
+        
        fig.show()
        
        
@@ -2296,6 +2338,8 @@ def plot_model(model, plot='cluster', feature=None):
            sys.exit('(Type Error): Plot Type not supported for this model.')


+
+
 def save_model(model, model_name, verbose=True):
    
    """
@@ -2683,6 +2727,8 @@ def deploy_model(model,
       
    Description:
    ------------
+    (In Preview)
+
    This function deploys the transformation pipeline and trained model object for
    production use. The platform of deployment can be defined under the platform
    param along with the applicable authentication tokens which are passed as a
@@ -2808,7 +2854,7 @@ def get_clusters(data,
                                       Power_transform_data = transformation,
                                       Power_transform_method = 'yj',
                                       apply_pca = pca,
-                                       pca_variance_retained=pca_components,
+                                       pca_variance_retained_or_number_of_components=pca_components,
                                       random_state = seed)
    
    
@@ -2817,5 +2863,4 @@ def get_clusters(data,
    except:
        c = create_model(model=model, verbose=False)
    dataset = assign_model(c, verbose=False)
-    return dataset
-
+    return dataset
\ No newline at end of file
--- a/regression.py
+++ b/regression.py