Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenDocCN
pycaret
提交
29be74fd
pycaret
项目概览
OpenDocCN
/
pycaret
通知
2
Star
2
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
pycaret
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
29be74fd
编写于
7月 20, 2020
作者:
P
PyCaret
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
updated pycaret-nightly==0.28 part 1/3
上级
f4da92cd
变更
4
隐藏空白更改
内联
并排
Showing
4 changed files
with
155 additions
and
58 deletions
+155
-58
Dockerfile
Dockerfile
+11
-0
pycaret/anomaly.py
pycaret/anomaly.py
+35
-6
pycaret/clustering.py
pycaret/clustering.py
+90
-48
pycaret/nlp.py
pycaret/nlp.py
+19
-4
未找到文件。
Dockerfile
0 → 100644
浏览文件 @
29be74fd
FROM
python:3.7-slim
WORKDIR
/app
ADD
. /app
RUN
apt-get update
&&
apt-get
install
-y
libgomp1
RUN
pip
install
--trusted-host
pypi.python.org
-r
requirements.txt
CMD
pytest
\ No newline at end of file
pycaret/anomaly.py
浏览文件 @
29be74fd
...
...
@@ -2,7 +2,7 @@
# Author: Moez Ali <moez.ali@queensu.ca>
# License: MIT
# Release: PyCaret 2.0x
# Last modified :
14
/07/2020
# Last modified :
20
/07/2020
def
setup
(
data
,
categorical_features
=
None
,
...
...
@@ -1016,6 +1016,7 @@ def setup(data,
if
verbose
:
if
html_param
:
clear_output
()
print
(
'Setup Succesfully Completed!'
)
display
(
functions_
)
else
:
print
(
functions_
.
data
)
...
...
@@ -1239,7 +1240,7 @@ def create_model(model = None,
sys
.
exit
(
'(Value Error): Model Not Available. Please see docstring for list of available models.'
)
#checking fraction type:
if
type
(
fraction
)
is
not
float
:
if
fraction
<=
0
or
fraction
>=
1
:
sys
.
exit
(
'(Type Error): Fraction parameter can only take value as float between 0 to 1.'
)
#checking verbose parameter
...
...
@@ -1362,6 +1363,13 @@ def create_model(model = None,
model
=
SOS
(
contamination
=
fraction
,
**
kwargs
)
full_name
=
'Stochastic Outlier Selection'
else
:
def
get_model_name
(
e
):
return
str
(
e
).
split
(
"("
)[
0
]
model
==
model
full_name
=
get_model_name
(
model
)
logger
.
info
(
str
(
full_name
)
+
' Imported succesfully'
)
#monitor update
...
...
@@ -1665,6 +1673,7 @@ def tune_model(model=None,
method
=
'drop'
,
estimator
=
None
,
optimize
=
None
,
custom_grid
=
None
,
#added in pycaret 2.0.0
fold
=
10
,
verbose
=
True
):
#added in pycaret 2.0.0
...
...
@@ -1773,6 +1782,11 @@ def tune_model(model=None,
optimize: string, default = None
custom_grid: list, default = None
By default, a pre-defined list of fraction values is iterated over to
optimize the supervised objective. To overwrite default iteration,
pass a list of fraction values to iterate over in custom_grid param.
For Classification tasks:
Accuracy, AUC, Recall, Precision, F1, Kappa
...
...
@@ -1884,7 +1898,11 @@ def tune_model(model=None,
import
datetime
,
time
#progress bar
max_steps
=
25
if
custom_grid
is
None
:
max_steps
=
25
else
:
max_steps
=
15
+
len
(
custom_grid
)
progress
=
ipw
.
IntProgress
(
value
=
0
,
min
=
0
,
max
=
max_steps
,
step
=
1
,
description
=
'Processing: '
)
if
verbose
:
...
...
@@ -1994,8 +2012,19 @@ def tune_model(model=None,
progress
.
value
+=
1
#defining tuning grid
param_grid_with_zero
=
[
0
,
0.01
,
0.02
,
0.03
,
0.04
,
0.05
,
0.06
,
0.07
,
0.08
,
0.09
,
0.10
]
param_grid
=
[
0.01
,
0.02
,
0.03
,
0.04
,
0.05
,
0.06
,
0.07
,
0.08
,
0.09
,
0.10
]
if
custom_grid
is
not
None
:
param_grid
=
custom_grid
param_grid_with_zero
=
[
0
]
for
i
in
param_grid
:
param_grid_with_zero
.
append
(
i
)
else
:
param_grid_with_zero
=
[
0
,
0.01
,
0.02
,
0.03
,
0.04
,
0.05
,
0.06
,
0.07
,
0.08
,
0.09
,
0.10
]
param_grid
=
[
0.01
,
0.02
,
0.03
,
0.04
,
0.05
,
0.06
,
0.07
,
0.08
,
0.09
,
0.10
]
master
=
[];
master_df
=
[]
...
...
@@ -2199,7 +2228,7 @@ def tune_model(model=None,
supervised
=
True
,
supervised_target
=
supervised_target
,
session_id
=
seed
,
log
ging
=
False
,
#added in pycaret==2.0.0
log
_experiment
=
False
,
#added in pycaret==2.0.0
profile
=
False
,
verbose
=
False
)
...
...
pycaret/clustering.py
浏览文件 @
29be74fd
...
...
@@ -2,7 +2,7 @@
# Author: Moez Ali <moez.ali@queensu.ca>
# License: MIT
# Release: PyCaret 2.0x
# Last modified :
14
/07/2020
# Last modified :
20
/07/2020
def
setup
(
data
,
categorical_features
=
None
,
...
...
@@ -27,7 +27,7 @@ def setup(data,
rare_level_threshold
=
0.10
,
bin_numeric_features
=
None
,
remove_multicollinearity
=
False
,
multicollinearity_threshold
=
0.9
,
multicollinearity_threshold
=
0.9
,
group_features
=
None
,
group_names
=
None
,
supervised
=
False
,
...
...
@@ -40,8 +40,8 @@ def setup(data,
log_plots
=
False
,
#added in pycaret==2.0.0
log_profile
=
False
,
#added in pycaret==2.0.0
log_data
=
False
,
#added in pycaret==2.0.0
silent
=
False
,
#added in pycaret==2.0.0
verbose
=
True
,
silent
=
False
,
#added in pycaret==2.0.0
verbose
=
True
,
profile
=
False
,):
"""
...
...
@@ -252,13 +252,13 @@ def setup(data,
unique number is then distributed as a seed in all functions used during the
experiment. This can be used for later reproducibility of the entire experiment.
experiment_name: str, default = None
Name of experiment for logging. When set to None, 'clf' is by default used as
alias for the experiment name.
log_experiment: bool, default = True
When set to True, all metrics and parameters are logged on MLFlow server.
experiment_name: str, default = None
Name of experiment for logging. When set to None, 'clu' is by default used as
alias for the experiment name.
log_plots: bool, default = False
When set to True, specific plots are logged in MLflow as a png file. By default,
it is set to False.
...
...
@@ -267,6 +267,9 @@ def setup(data,
When set to True, data profile is also logged on MLflow as a html file. By default,
it is set to False.
log_data: bool, default = False
When set to True, train and test dataset are logged as csv.
silent: bool, default = False
When set to True, confirmation of data types is not required. All preprocessing will
be performed assuming automatically inferred data types. Not recommended for direct use
...
...
@@ -826,7 +829,7 @@ def setup(data,
apply_binning
=
apply_binning_pass
,
features_to_binn
=
features_to_bin_pass
,
remove_multicollinearity
=
remove_multicollinearity
,
maximum_correlation_between_features
=
multicollinearity_threshold
,
maximum_correlation_between_features
=
multicollinearity_threshold
,
apply_grouping
=
apply_grouping_pass
,
features_to_group_ListofList
=
group_features_pass
,
group_name
=
group_names_pass
,
...
...
@@ -1012,8 +1015,10 @@ def setup(data,
if
verbose
:
if
html_param
:
clear_output
()
print
(
'Setup Succesfully Completed!'
)
display
(
functions_
)
else
:
print
(
'Setup Succesfully Completed!'
)
print
(
functions_
.
data
)
if
profile
:
...
...
@@ -1163,24 +1168,31 @@ def create_model(model = None,
Parameters
----------
model : string, default = None
Enter abbreviated string of the model class. List of available models supported:
Model Abbreviated String Original Implementation
--------- ------------------ -----------------------
K-Means Clustering 'kmeans' sklearn.cluster.KMeans.html
Affinity Propagation 'ap' AffinityPropagation.html
Mean shift Clustering 'meanshift' sklearn.cluster.MeanShift.html
Spectral Clustering 'sc' SpectralClustering.html
Agglomerative Clustering 'hclust' AgglomerativeClustering.html
Density-Based Spatial Clustering 'dbscan' sklearn.cluster.DBSCAN.html
OPTICS Clustering 'optics' sklearn.cluster.OPTICS.html
Birch Clustering 'birch' sklearn.cluster.Birch.html
K-Modes Clustering 'kmodes' git/nicodv/kmodes
model : string / object, default = None
Enter ID of the models available in model library or pass an untrained model
object consistent with fit / predict API to train and evaluate model. List of
models available in model library:
ID Name
------ -----------
'kmeans' K-Means Clustering
'ap' Affinity Propagation
'meanshift' Mean shift Clustering
'sc' Spectral Clustering
'hclust' Agglomerative Clustering
'dbscan' Density-Based Spatial Clustering
'optics' OPTICS Clustering
'birch' Birch Clustering
'kmodes' K-Modes Clustering
num_clusters: int, default = None
Number of clusters to be generated with the dataset. If None, num_clusters is set to 4.
Number of clusters to be generated with the dataset. If None, num_clusters
is set to 4.
ground_truth: string, default = None
When ground_truth is provided, Homogeneity Score, Rand Index, and
Completeness Score is evaluated and printed along with other metrics.
verbose: Boolean, default = True
Status update is not printed when verbose is set to False.
...
...
@@ -1194,7 +1206,12 @@ def create_model(model = None,
Returns:
--------
model: trained model object
score grid: A table containing the Silhouette, Calinski-Harabasz,
----------- Davies-Bouldin, Homogeneity Score, Rand Index, and
Completeness Score. Last 3 are only evaluated when
ground_truth param is provided.
model: trained model object
------
Warnings:
...
...
@@ -1242,18 +1259,19 @@ def create_model(model = None,
#checking for allowed models
allowed_models
=
[
'kmeans'
,
'ap'
,
'meanshift'
,
'sc'
,
'hclust'
,
'dbscan'
,
'optics'
,
'birch'
,
'kmodes'
]
if
type
(
model
)
is
str
:
if
model
not
in
allowed_models
:
sys
.
exit
(
'(Value Error): Model Not Available. Please see docstring for list of available models.'
)
#check num_clusters parameter:
if
num_clusters
is
not
None
:
no_num_required
=
[
'ap'
,
'meanshift'
,
'dbscan'
,
'optics'
]
if
model
in
no_num_required
:
sys
.
exit
(
'(Value Error): num_clusters parameter not required for specified model. Remove num_clusters to run this model.'
)
if
model
not
in
allowed_models
:
sys
.
exit
(
'(Value Error): Model Not Available. Please see docstring for list of available models.'
)
#checking num_clusters type:
if
num_clusters
is
not
None
:
if
type
(
num_clusters
)
is
not
int
:
if
num_clusters
<=
1
:
sys
.
exit
(
'(Type Error): num_clusters parameter can only take value integer value greater than 1.'
)
#check ground truth exist in data_
...
...
@@ -1355,6 +1373,13 @@ def create_model(model = None,
from
kmodes.kmodes
import
KModes
model
=
KModes
(
n_clusters
=
num_clusters
,
n_jobs
=
n_jobs_param
,
random_state
=
seed
,
**
kwargs
)
full_name
=
'K-Modes Clustering'
else
:
def
get_model_name
(
e
):
return
str
(
e
).
split
(
"("
)[
0
]
model
==
model
full_name
=
get_model_name
(
model
)
logger
.
info
(
str
(
full_name
)
+
' Imported succesfully'
)
...
...
@@ -1383,7 +1408,7 @@ def create_model(model = None,
try
:
silhouette
=
metrics
.
silhouette_score
(
X
,
model
.
labels_
)
silhouette
=
silhouette
.
round
(
4
)
silhouette
=
round
(
silhouette
,
4
)
metric
.
append
(
'Silhouette'
)
metric_value
.
append
(
silhouette
)
...
...
@@ -1392,7 +1417,7 @@ def create_model(model = None,
try
:
chs
=
metrics
.
calinski_harabasz_score
(
X
,
model
.
labels_
)
chs
=
chs
.
round
(
4
)
chs
=
round
(
chs
,
4
)
metric
.
append
(
'Calinski-Harabasz'
)
metric_value
.
append
(
chs
)
except
:
...
...
@@ -1400,7 +1425,7 @@ def create_model(model = None,
try
:
db
=
metrics
.
davies_bouldin_score
(
X
,
model
.
labels_
)
db
=
db
.
round
(
4
)
db
=
round
(
db
,
4
)
metric
.
append
(
'Davies-Bouldin'
)
metric_value
.
append
(
db
)
...
...
@@ -1415,7 +1440,7 @@ def create_model(model = None,
try
:
hs
=
metrics
.
homogeneity_score
(
gt
,
model
.
labels_
)
hs
=
hs
.
round
(
4
)
hs
=
round
(
hs
,
4
)
metric
.
append
(
'Homogeneity Score'
)
metric_value
.
append
(
hs
)
...
...
@@ -1424,15 +1449,16 @@ def create_model(model = None,
try
:
ari
=
metrics
.
adjusted_rand_score
(
gt
,
model
.
labels_
)
ari
=
ari
.
round
(
4
)
metric
.
append
(
'
ARI
'
)
ari
=
round
(
ari
,
4
)
metric
.
append
(
'
Rand Index
'
)
metric_value
.
append
(
ari
)
except
:
pass
try
:
cs
=
metrics
.
completeness_score
(
gt
,
model
.
labels_
)
cs
=
cs
.
round
(
4
)
cs
=
round
(
cs
,
4
)
metric
.
append
(
'Completeness Score'
)
metric_value
.
append
(
cs
)
except
:
...
...
@@ -1591,10 +1617,6 @@ def assign_model(model,
dataframe: Returns a dataframe with assigned clusters using a trained model.
---------
Warnings:
---------
None
"""
...
...
@@ -1731,6 +1753,7 @@ def tune_model(model=None,
supervised_target
=
None
,
estimator
=
None
,
optimize
=
None
,
custom_grid
=
None
,
#added in pycaret 2.0.0
fold
=
10
,
verbose
=
True
):
#added in pycaret 2.0.0
...
...
@@ -1826,6 +1849,11 @@ def tune_model(model=None,
optimize: string, default = None
custom_grid: list, default = None
By default, a pre-defined number of clusters is iterated over to
optimize the supervised objective. To overwrite default iteration,
pass a list of num_clusters to iterate over in custom_grid param.
For Classification tasks:
Accuracy, AUC, Recall, Precision, F1, Kappa
...
...
@@ -1934,7 +1962,10 @@ def tune_model(model=None,
import
datetime
,
time
#progress bar
max_steps
=
25
if
custom_grid
is
None
:
max_steps
=
25
else
:
max_steps
=
15
+
len
(
custom_grid
)
progress
=
ipw
.
IntProgress
(
value
=
0
,
min
=
0
,
max
=
max_steps
,
step
=
1
,
description
=
'Processing: '
)
...
...
@@ -2036,9 +2067,20 @@ def tune_model(model=None,
progress
.
value
+=
1
#defining tuning grid
param_grid_with_zero
=
[
0
,
4
,
5
,
6
,
8
,
10
,
14
,
18
,
25
,
30
,
40
]
param_grid
=
[
4
,
5
,
6
,
8
,
10
,
14
,
18
,
25
,
30
,
40
]
if
custom_grid
is
not
None
:
param_grid
=
custom_grid
param_grid_with_zero
=
[
0
]
for
i
in
param_grid
:
param_grid_with_zero
.
append
(
i
)
else
:
param_grid
=
[
4
,
5
,
6
,
8
,
10
,
14
,
18
,
25
,
30
,
40
]
param_grid_with_zero
=
[
0
,
4
,
5
,
6
,
8
,
10
,
14
,
18
,
25
,
30
,
40
]
master
=
[];
master_df
=
[]
monitor
.
iloc
[
1
,
1
:]
=
'Creating Clustering Model'
...
...
@@ -2236,13 +2278,13 @@ def tune_model(model=None,
rare_level_threshold
=
combine_rare_threshold_pass
,
bin_numeric_features
=
features_to_bin_pass
,
remove_multicollinearity
=
remove_multicollinearity_pass
,
multicollinearity_threshold
=
multicollinearity_threshold_pass
,
multicollinearity_threshold
=
multicollinearity_threshold_pass
,
group_features
=
group_features_pass
,
group_names
=
group_names_pass
,
supervised
=
True
,
supervised_target
=
supervised_target
,
session_id
=
seed
,
log
ging
=
False
,
#added in pycaret==2.0.0
log
_experiment
=
False
,
#added in pycaret==2.0.0
profile
=
False
,
verbose
=
False
)
...
...
pycaret/nlp.py
浏览文件 @
29be74fd
...
...
@@ -2,7 +2,7 @@
# Author: Moez Ali <moez.ali@queensu.ca>
# License: MIT
# Release: PyCaret 2.0x
# Last modified :
09
/07/2020
# Last modified :
20
/07/2020
def
setup
(
data
,
target
=
None
,
...
...
@@ -811,7 +811,7 @@ def create_model(model=None,
#checking round parameter
if
num_topics
is
not
None
:
if
type
(
num_topics
)
is
not
int
:
if
num_topics
<=
1
:
sys
.
exit
(
'(Type Error): num_topics parameter only accepts integer value.'
)
#checking verbose parameter
...
...
@@ -2002,6 +2002,7 @@ def tune_model(model=None,
supervised_target
=
None
,
estimator
=
None
,
optimize
=
None
,
custom_grid
=
None
,
#added in pycaret 2.0.0
auto_fe
=
True
,
fold
=
10
,
verbose
=
True
):
#added in pycaret==2.0.0
...
...
@@ -2103,6 +2104,11 @@ def tune_model(model=None,
optimize: string, default = None
custom_grid: list, default = None
By default, a pre-defined number of topics is iterated over to
optimize the supervised objective. To overwrite default iteration,
pass a list of num_topics to iterate over in custom_grid param.
For Classification tasks:
Accuracy, AUC, Recall, Precision, F1, Kappa
...
...
@@ -2230,7 +2236,11 @@ def tune_model(model=None,
import
datetime
,
time
#progress bar
max_steps
=
25
if
custom_grid
is
None
:
max_steps
=
25
else
:
max_steps
=
10
+
len
(
custom_grid
)
progress
=
ipw
.
IntProgress
(
value
=
0
,
min
=
0
,
max
=
max_steps
,
step
=
1
,
description
=
'Processing: '
)
if
verbose
:
if
html_param
:
...
...
@@ -2343,7 +2353,12 @@ def tune_model(model=None,
progress
.
value
+=
1
#defining tuning grid
param_grid
=
[
2
,
4
,
8
,
16
,
32
,
64
,
100
,
200
,
300
,
400
]
if
custom_grid
is
not
None
:
param_grid
=
custom_grid
else
:
param_grid
=
[
2
,
4
,
8
,
16
,
32
,
64
,
100
,
200
,
300
,
400
]
master
=
[];
master_df
=
[]
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录