未验证 提交 de62af2c 编写于 作者: P PyCaret 提交者: GitHub

Add files via upload

上级 136558c8
......@@ -2095,7 +2095,8 @@ def tune_model(model=None,
def plot_model(model,
plot = 'tsne'):
plot = 'tsne',
feature = None):
"""
......@@ -2129,6 +2130,9 @@ def plot_model(model,
t-SNE (3d) Dimension Plot 'tsne'
UMAP Dimensionality Plot 'umap'
feature : string, default = None
Feature column used as the hover tooltip. By default, the first column of the
dataset is used as the hover tooltip when no feature is passed.
Returns:
--------
......@@ -2190,11 +2194,19 @@ def plot_model(model,
X = pd.DataFrame(X_embedded)
X['Label'] = Label
if feature is not None:
X['Feature'] = data_[feature]
else:
X['Feature'] = data_[data_.columns[0]]
import plotly.express as px
df = X
fig = px.scatter_3d(df, x=0, y=1, z=2,
color='Label', title='3d TSNE Plot for Outliers', opacity=0.7, width=900, height=800)
fig = px.scatter_3d(df, x=0, y=1, z=2, hover_data=['Feature'], color='Label', title='3d TSNE Plot for Outliers',
opacity=0.7, width=900, height=800)
fig.show()
elif plot == 'umap':
......@@ -2213,8 +2225,15 @@ def plot_model(model,
import plotly.express as px
df = X
df['Label'] = Label
if feature is not None:
df['Feature'] = data_[feature]
else:
df['Feature'] = data_[data_.columns[0]]
fig = px.scatter(df, x=0, y=1,
color='Label', title='uMAP Plot for Outliers', opacity=0.7, width=900, height=800)
color='Label', title='uMAP Plot for Outliers', hover_data=['Feature'], opacity=0.7,
width=900, height=800)
fig.show()
......@@ -2577,6 +2596,8 @@ def deploy_model(model,
Description:
------------
(In Preview)
This function deploys the transformation pipeline and trained model object for
production use. The platform of deployment can be defined under the platform
param along with the applicable authentication tokens which are passed as a
......@@ -2704,7 +2725,7 @@ def get_outliers(data,
Power_transform_data = transformation,
Power_transform_method = 'yj',
apply_pca = pca,
pca_variance_retained=pca_components,
pca_variance_retained_or_number_of_components=pca_components,
random_state = seed)
......@@ -2713,5 +2734,4 @@ def get_outliers(data,
dataset = assign_model(c, verbose=False)
return dataset
return dataset
\ No newline at end of file
......@@ -25,6 +25,12 @@ def setup(data,
combine_rare_levels = False, #new
rare_level_threshold = 0.10, #new
bin_numeric_features = None, #new
remove_outliers = False, #new
outliers_threshold = 0.05, #new
remove_multicollinearity = False, #new
multicollinearity_threshold = 0.9, #new
create_clusters = False, #new
cluster_iter = 20, #new
session_id = None,
profile = False):
......@@ -163,8 +169,6 @@ def setup(data,
incremental : replacement for 'linear' pca when the dataset to be decomposed is
too large to fit in memory
pls : dimensionality reduction through supervised PLSregression technique.
pca_components: int/float, default = 0.99
Number of components to keep. if pca_components is a float, it is treated as
goal percentage for information retention. When pca_components param is integer
......@@ -195,6 +199,36 @@ def setup(data,
It is only optimal for gaussian data and underestimates number of bins for large
non-gaussian datasets.
remove_outliers: bool, default = False
When set to True, outliers are removed from the training data using an ensemble of
Isolation Forest, K Nearest Neighbour and PCA Outlier detectors. All of them are
unsupervised techniques. The contamination percentage is defined using the
outliers_threshold parameter.
outliers_threshold: float, default = 0.05
The percentage / proportion of outliers in the dataset can be defined using the
outliers_threshold param. By default, 0.05 is used, which means 0.025 of the values on
each side of the distribution's tail are dropped from the training data.
remove_multicollinearity: bool, default = False
When set to True, it drops the variables with inter-correlations higher than
the threshold defined under the multicollinearity_threshold param. When two features
are highly correlated with each other, the feature that is less correlated with the
target variable is dropped.
multicollinearity_threshold: float, default = 0.9
Threshold used for dropping the correlated features. Only comes into effect when
remove_multicollinearity is set to True.
create_clusters: bool, default = False
When set to True, an additional feature is created where each instance is assigned
to a cluster. The number of clusters is determined using a combination of the
Calinski-Harabasz and Silhouette criteria.
cluster_iter: int, default = 20
Number of iterations for creating clusters. Each iteration represents a cluster size.
Only comes into effect when the create_clusters param is set to True.
session_id: int, default = None
If None, a random seed is generated and returned in the Information grid. The
unique number is then distributed as a seed in all functions used during the
......@@ -279,9 +313,9 @@ def setup(data,
sys.exit('(Type Error): PCA parameter only accepts True or False.')
#pca method check
allowed_pca_methods = ['linear', 'kernel', 'incremental', 'pls']
allowed_pca_methods = ['linear', 'kernel', 'incremental']
if pca_method not in allowed_pca_methods:
sys.exit("(Value Error): pca method param only accepts 'linear', 'kernel', 'incremental', or 'pls'. ")
sys.exit("(Value Error): pca method param only accepts 'linear', 'kernel', or 'incremental'. ")
#pca components check
if pca is True:
......@@ -326,6 +360,35 @@ def setup(data,
if i not in all_cols:
sys.exit("(Value Error): Column type forced is either target column or doesn't exist in the dataset.")
#remove_outliers
if type(remove_outliers) is not bool:
sys.exit('(Type Error): remove_outliers parameter only accepts True or False.')
#outliers_threshold
if type(outliers_threshold) is not float:
sys.exit('(Type Error): outliers_threshold must be a float between 0 and 1. ')
#remove_multicollinearity
if type(remove_multicollinearity) is not bool:
sys.exit('(Type Error): remove_multicollinearity parameter only accepts True or False.')
#multicollinearity_threshold
if type(multicollinearity_threshold) is not float:
sys.exit('(Type Error): multicollinearity_threshold must be a float between 0 and 1. ')
#multicollinearity and multiclass check
if data[target].value_counts().count() > 2:
if remove_multicollinearity is True:
sys.exit('(Type Error): remove_multicollinearity cannot be used when target is multiclass. ')
#create_clusters
if type(create_clusters) is not bool:
sys.exit('(Type Error): create_clusters parameter only accepts True or False.')
#cluster_iter
if type(cluster_iter) is not int:
sys.exit('(Type Error): cluster_iter must be a integer greater than 1. ')
#cannot drop target
if ignore_features is not None:
if target in ignore_features:
......@@ -516,8 +579,14 @@ def setup(data,
rara_level_threshold_percentage = rare_level_threshold, #new
apply_binning = apply_binning_pass, #new
features_to_binn = features_to_bin_pass, #new
remove_outliers = remove_outliers, #new
outlier_contamination_percentage = outliers_threshold, #new
remove_multicollinearity = remove_multicollinearity, #new
maximum_correlation_between_features = multicollinearity_threshold, #new
cluster_entire_data = create_clusters, #new
range_of_clusters_to_try = cluster_iter, #new
display_types = True, #this is for inferred input box
target_transformation = False, #to be dealt later
target_transformation = False, #not needed for classification
random_state = seed)
progress.value += 1
......@@ -530,10 +599,10 @@ def setup(data,
else:
label_encoded = 'None'
res_type = ['quit','Quit','exit','EXIT','q','Q','e','E','QUIT','Exit']
res = preprocess.dtypes.response
if res in res_type:
sys.exit("(Process Exit): setup has been interupted with user command 'quit'. setup must rerun." )
......@@ -577,6 +646,21 @@ def setup(data,
else:
numeric_bin_grid = 'True'
if remove_outliers is False:
outliers_threshold_grid = None
else:
outliers_threshold_grid = outliers_threshold
if remove_multicollinearity is False:
multicollinearity_threshold_grid = None
else:
multicollinearity_threshold_grid = multicollinearity_threshold
if create_clusters is False:
cluster_iter_grid = None
else:
cluster_iter_grid = cluster_iter
learned_types = preprocess.dtypes.learent_dtypes
learned_types.drop(target, inplace=True)
......@@ -812,6 +896,12 @@ def setup(data,
['Combine Rare Levels ', combine_rare_levels],
['Rare Level Threshold ', rare_level_threshold_grid],
['Numeric Binning ', numeric_bin_grid],
['Remove Outliers ', remove_outliers],
['Outliers Threshold ', outliers_threshold_grid],
['Remove Multicollinearity ', remove_multicollinearity],
['Multicollinearity Threshold ', multicollinearity_threshold_grid],
['Clustering ', create_clusters],
['Clustering Iteration ', cluster_iter_grid],
['Missing Values ', missing_flag],
['Numeric Imputer ', numeric_imputation],
['Categorical Imputer ', categorical_imputation],
......@@ -834,7 +924,7 @@ def setup(data,
'''
#log into experiment
experiment__.append(('Info', functions))
experiment__.append(('Classification Setup Config', functions))
experiment__.append(('X_training Set', X_train))
experiment__.append(('y_training Set', y_train))
experiment__.append(('X_test Set', X_test))
......@@ -886,6 +976,12 @@ def setup(data,
['Combine Rare Levels ', combine_rare_levels],
['Rare Level Threshold ', rare_level_threshold_grid],
['Numeric Binning ', numeric_bin_grid],
['Remove Outliers ', remove_outliers],
['Outliers Threshold ', outliers_threshold_grid],
['Remove Multicollinearity ', remove_multicollinearity],
['Multicollinearity Threshold ', multicollinearity_threshold_grid],
['Clustering ', create_clusters],
['Clustering Iteration ', cluster_iter_grid],
['Missing Values ', missing_flag],
['Numeric Imputer ', numeric_imputation],
['Categorical Imputer ', categorical_imputation],
......@@ -908,7 +1004,7 @@ def setup(data,
'''
#log into experiment
experiment__.append(('Info', functions))
experiment__.append(('Classification Setup Config', functions))
experiment__.append(('X_training Set', X_train))
experiment__.append(('y_training Set', y_train))
experiment__.append(('X_test Set', X_test))
......@@ -955,6 +1051,12 @@ def setup(data,
['Combine Rare Levels ', combine_rare_levels],
['Rare Level Threshold ', rare_level_threshold_grid],
['Numeric Binning ', numeric_bin_grid],
['Remove Outliers ', remove_outliers],
['Outliers Threshold ', outliers_threshold_grid],
['Remove Multicollinearity ', remove_multicollinearity],
['Multicollinearity Threshold ', multicollinearity_threshold_grid],
['Clustering ', create_clusters],
['Clustering Iteration ', cluster_iter_grid],
['Missing Values ', missing_flag],
['Numeric Imputer ', numeric_imputation],
['Categorical Imputer ', categorical_imputation],
......@@ -977,7 +1079,7 @@ def setup(data,
'''
#log into experiment
experiment__.append(('Classification Info', functions))
experiment__.append(('Classification Setup Config', functions))
experiment__.append(('X_training Set', X_train))
experiment__.append(('y_training Set', y_train))
experiment__.append(('X_test Set', X_test))
......@@ -988,6 +1090,7 @@ def setup(data,
def create_model(estimator = None,
ensemble = False,
method = None,
......@@ -3373,7 +3476,7 @@ def tune_model(estimator = None,
from sklearn.tree import DecisionTreeClassifier
param_grid = {"max_depth": np.random.randint(3, (len(X_train.columns)*.85),4),
param_grid = {"max_depth": np.random.randint(1, (len(X_train.columns)*.85),4),
"max_features": np.random.randint(3, len(X_train.columns),4),
"min_samples_leaf": [2,3,4],
"criterion": ["gini", "entropy"]}
......@@ -7353,6 +7456,8 @@ def deploy_model(model,
Description:
------------
(In Preview)
This function deploys the transformation pipeline and trained model object for
production use. The platform of deployment can be defined under the platform
param along with the applicable authentication tokens which are passed as a
......
......@@ -2032,7 +2032,7 @@ def tune_model(model=None,
def plot_model(model, plot='cluster', feature=None):
def plot_model(model, plot='cluster', feature = None, label = False):
"""
......@@ -2072,8 +2072,13 @@ def plot_model(model, plot='cluster', feature=None):
Distribution Plot 'distribution'
feature : string, default = None
Name of feature column for x-axis of distribution plot. It only comes in effect
when plot = 'distribution'.
Name of feature column for the x-axis when plot = 'distribution'. When plot is
'cluster' or 'tsne', the feature column is used as a hover tooltip and/or label
when label is set to True. If no feature name is passed for 'cluster' or 'tsne',
the first column of the dataset is used as the hover tooltip by default.
label : bool, default = False
When set to True, data labels are shown in the 'cluster' and 'tsne' plots.
Returns:
--------
......@@ -2087,7 +2092,7 @@ def plot_model(model, plot='cluster', feature=None):
"""
#exception checking
import sys
......@@ -2099,7 +2104,17 @@ def plot_model(model, plot='cluster', feature=None):
allowed_plots = ['cluster', 'tsne', 'elbow', 'silhouette', 'distance', 'distribution']
if plot not in allowed_plots:
sys.exit('(Value Error): Plot Not Available. Please see docstring for list of available plots.')
if type(label) is not bool:
sys.exit('(Type Error): Label param only accepts True or False. ')
if feature is not None:
if type(feature) is not str:
sys.exit('(Type Error): feature parameter must be string containing column name of dataset. ')
#specific disallowed plots
"""
......@@ -2161,7 +2176,18 @@ def plot_model(model, plot='cluster', feature=None):
pca_ = pca_.rename(columns={0: "PCA1", 1: "PCA2"})
pca_['Cluster'] = cluster
fig = px.scatter(pca_, x="PCA1", y="PCA2", color='Cluster', opacity=0.5)
if feature is not None:
pca_['Feature'] = data_[feature]
else:
pca_['Feature'] = data_[data_.columns[0]]
if label:
pca_['Label'] = pca_['Feature']
if label:
fig = px.scatter(pca_, x="PCA1", y="PCA2", text='Label', color='Cluster', opacity=0.5)
else:
fig = px.scatter(pca_, x="PCA1", y="PCA2", hover_data=['Feature'], color='Cluster', opacity=0.5)
fig.update_traces(textposition='top center')
fig.update_layout(plot_bgcolor='rgb(240,240,240)')
......@@ -2210,10 +2236,26 @@ def plot_model(model, plot='cluster', feature=None):
X_embedded = pd.DataFrame(X_embedded)
X_embedded['Cluster'] = cluster
if feature is not None:
X_embedded['Feature'] = data_[feature]
else:
X_embedded['Feature'] = data_[data_.columns[0]]
if label:
X_embedded['Label'] = X_embedded['Feature']
import plotly.express as px
df = X_embedded
fig = px.scatter_3d(df, x=0, y=1, z=2,
color='Cluster', title='3d TSNE Plot for Clusters', opacity=0.7, width=900, height=800)
if label:
fig = px.scatter_3d(df, x=0, y=1, z=2, color='Cluster', title='3d TSNE Plot for Clusters',
text = 'Label', opacity=0.7, width=900, height=800)
else:
fig = px.scatter_3d(df, x=0, y=1, z=2, color='Cluster', title='3d TSNE Plot for Clusters',
hover_data = ['Feature'], opacity=0.7, width=900, height=800)
fig.show()
......@@ -2296,6 +2338,8 @@ def plot_model(model, plot='cluster', feature=None):
sys.exit('(Type Error): Plot Type not supported for this model.')
def save_model(model, model_name, verbose=True):
"""
......@@ -2683,6 +2727,8 @@ def deploy_model(model,
Description:
------------
(In Preview)
This function deploys the transformation pipeline and trained model object for
production use. The platform of deployment can be defined under the platform
param along with the applicable authentication tokens which are passed as a
......@@ -2808,7 +2854,7 @@ def get_clusters(data,
Power_transform_data = transformation,
Power_transform_method = 'yj',
apply_pca = pca,
pca_variance_retained=pca_components,
pca_variance_retained_or_number_of_components=pca_components,
random_state = seed)
......@@ -2817,5 +2863,4 @@ def get_clusters(data,
except:
c = create_model(model=model, verbose=False)
dataset = assign_model(c, verbose=False)
return dataset
return dataset
\ No newline at end of file
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册