Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Pytorch Widedeep
提交
cbac7553
P
Pytorch Widedeep
项目概览
Greenplum
/
Pytorch Widedeep
11 个月 前同步成功
通知
9
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Pytorch Widedeep
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
cbac7553
编写于
10月 10, 2019
作者:
J
jrzaurin
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
refactored code to a class with fit and transform methods
上级
7004b16c
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
152 addition
and
34 deletion
+152
-34
pytorch_widedeep/utils/deep_utils.py
pytorch_widedeep/utils/deep_utils.py
+152
-34
未找到文件。
pytorch_widedeep/utils/deep_utils.py
浏览文件 @
cbac7553
import
numpy
as
np
import
pandas
as
pd
import
warnings
from
sklearn.preprocessing
import
StandardScaler
...
...
@@ -8,50 +9,17 @@ from ..wdtypes import *
# Disable pandas' chained-assignment (SettingWithCopy) warning: this module
# assigns into columns of copied frames on purpose, so the warning is noise.
pd.options.mode.chained_assignment = None
def prepare_deep(df: pd.DataFrame,
                 embed_cols: List[Union[str, Tuple[str, int]]],
                 continuous_cols: List[str],
                 already_standard: Optional[List[str]] = None,
                 scale: bool = True,
                 default_embed_dim: int = 8):
    """Prepare the input for the "deep" side of a Wide & Deep model.

    Label-encodes the embedding columns and optionally standardises the
    continuous columns.

    Parameters
    ----------
    df: DataFrame containing both the embedding and continuous columns.
    embed_cols: columns to be represented with embeddings, either plain
        strings or ``(name, dim)`` tuples where ``dim`` is the embedding
        dimension for that column.
    continuous_cols: names of the continuous columns.
    already_standard: continuous columns that are already standardised
        and must therefore be skipped by the scaler.
    scale: whether to standardise the continuous columns.
    default_embed_dim: embedding dimension used when ``embed_cols`` are
        plain strings.

    Returns
    -------
    tuple of (values, embeddings_input, encoding_dict, deep_column_idx):
    the processed data as an ndarray, a list of ``(col, n_unique, dim)``
    tuples, the per-column value->code mappings, and a mapping from
    column name to its position in ``values``.
    """
    # Check against the builtin ``tuple``: isinstance with the typing.Tuple
    # alias is deprecated and fragile across Python versions.
    if isinstance(embed_cols[0], tuple):
        embed_dim = dict(embed_cols)
        embed_coln = [emb[0] for emb in embed_cols]
    else:
        embed_dim = {e: default_embed_dim for e in embed_cols}
        embed_coln = embed_cols
    deep_cols = embed_coln + continuous_cols
    # Subset first, then copy: avoids duplicating columns that are unused.
    df_deep = df[deep_cols].copy()
    df_deep, encoding_dict = label_encode(df_deep, cols=embed_coln)
    embeddings_input = []
    for col, encoding in encoding_dict.items():
        embeddings_input.append((col, len(encoding), embed_dim[col]))
    deep_column_idx = {k: v for v, k in enumerate(df_deep.columns)}
    if scale:
        scaler = StandardScaler()
        if already_standard is not None:
            standardize_cols = [c for c in continuous_cols
                                if c not in already_standard]
        else:
            standardize_cols = continuous_cols
        for cc in standardize_cols:
            # per-column fit_transform: each column gets its own mean/std
            df_deep[cc] = scaler.fit_transform(
                df_deep[cc].values.reshape(-1, 1).astype(float))
    return df_deep.values, embeddings_input, encoding_dict, deep_column_idx
def label_encode(df_inp: pd.DataFrame,
                 cols: Optional[List[str]] = None,
                 val_to_idx: Optional[Dict[str, Dict[str, int]]] = None):
    """Label-encode selected columns of a DataFrame.

    Parameters
    ----------
    df_inp: input DataFrame (not modified; a copy is returned).
    cols: columns to encode. If ``None``, every ``object``-dtype column
        is encoded.
    val_to_idx: pre-computed value->code mappings per column. If falsy,
        the mappings are built from the unique values found in ``cols``
        (codes assigned in order of first appearance).

    Returns
    -------
    (df, val_to_idx): the encoded copy of the DataFrame and the mapping
    used, so it can be reused to encode new data consistently.

    Raises
    ------
    KeyError: if ``val_to_idx`` is supplied and a column contains a value
        missing from its mapping.
    """
    df = df_inp.copy()
    # identity comparison with None, not ``==`` (fixes ``cols == None``)
    if cols is None:
        cols = list(df.select_dtypes(include=['object']).columns)
    if not val_to_idx:
        val_to_idx = {
            c: {val: idx for idx, val in enumerate(df[c].unique())}
            for c in cols
        }
    for col, encoding in val_to_idx.items():
        # ``apply`` (not ``map``) so unseen values raise rather than
        # silently becoming NaN
        df[col] = df[col].apply(lambda x: encoding[x])
    return df, val_to_idx
class PrepareDeep(object):
    """Preprocess the input for the "deep" side of a Wide & Deep model.

    Label-encodes the embedding columns and (optionally) standardises the
    continuous columns, exposing a ``fit``/``transform``/``fit_transform``
    interface.

    Parameters
    ----------
    embed_cols: columns to be represented with embeddings, either plain
        strings or ``(name, dim)`` tuples where ``dim`` is the embedding
        dimension for that column.
    continuous_cols: names of the continuous columns.
    already_standard: continuous columns that are already standardised
        and must therefore be skipped by the scaler.
    scale: whether to standardise the continuous columns.
    default_embed_dim: embedding dimension used when ``embed_cols`` are
        plain strings.

    Attributes set during ``fit``:
    # encoding_dict: per-column value -> integer code mappings
    # embeddings_input: list of (col, n_unique, dim) tuples
    # scaler: fitted StandardScaler (only when scaling continuous cols)
    """

    def __init__(self,
                 embed_cols: List[Union[str, Tuple[str, int]]] = None,
                 continuous_cols: List[str] = None,
                 already_standard: Optional[List[str]] = None,
                 scale: bool = True,
                 default_embed_dim: int = 8):
        super(PrepareDeep, self).__init__()
        self.embed_cols = embed_cols
        self.continuous_cols = continuous_cols
        self.already_standard = already_standard
        self.scale = scale
        self.default_embed_dim = default_embed_dim
        # message fixed: the original said "must not be passed", the
        # opposite of what the check enforces
        assert (self.embed_cols is not None) or (self.continuous_cols is not None), \
            'Either the embedding columns or the continuous columns must be passed'

    def _prepare_embed(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of the embedding columns; records ``embed_dim``."""
        # builtin ``tuple``: isinstance with typing.Tuple is deprecated
        # and fragile across Python versions
        if isinstance(self.embed_cols[0], tuple):
            self.embed_dim = dict(self.embed_cols)
            embed_colname = [emb[0] for emb in self.embed_cols]
        else:
            self.embed_dim = {e: self.default_embed_dim for e in self.embed_cols}
            embed_colname = self.embed_cols
        return df.copy()[embed_colname]

    def _prepare_continuous(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of the continuous columns.

        Also records ``standardize_cols`` (only meaningful when
        ``self.scale`` is True).
        """
        if self.scale:
            if self.already_standard is not None:
                self.standardize_cols = [
                    c for c in self.continuous_cols
                    if c not in self.already_standard]
            else:
                self.standardize_cols = self.continuous_cols
        return df.copy()[self.continuous_cols]

    @staticmethod
    def _combine(df_deep, df_cont):
        """Concatenate whichever of the two frames exist.

        Replaces the original bare ``try/except`` that relied on a
        NameError and crashed with an uncaught NameError when only
        embedding columns were used (``df_cont.copy()`` on an undefined
        name inside the handler).
        """
        if df_deep is not None and df_cont is not None:
            return pd.concat([df_deep, df_cont], axis=1)
        return df_deep if df_deep is not None else df_cont.copy()

    def fit(self, df: pd.DataFrame) -> np.ndarray:
        """Fit the encoders/scaler on ``df`` and return the processed array."""
        df_deep, df_cont = None, None
        if self.embed_cols is not None:
            df_deep = self._prepare_embed(df)
            df_deep, self.encoding_dict = label_encode(
                df_deep, cols=df_deep.columns.tolist())
            self.embeddings_input = []
            for col, encoding in self.encoding_dict.items():
                self.embeddings_input.append(
                    (col, len(encoding), self.embed_dim[col]))
        if self.continuous_cols is not None:
            df_cont = self._prepare_continuous(df)
            if self.scale:
                df_std = df_cont[self.standardize_cols]
                self.scaler = StandardScaler().fit(df_std.values)
                df_cont[self.standardize_cols] = self.scaler.transform(df_std.values)
            else:
                warnings.warn('Continuous columns will not be normalised')
        return self._combine(df_deep, df_cont).values

    def transform(self, df: pd.DataFrame) -> np.ndarray:
        """Apply the encoders/scaler learned in ``fit`` to ``df``."""
        df_deep, df_cont = None, None
        if self.embed_cols is not None:
            df_deep = self._prepare_embed(df)
            # reuse the encodings learned in ``fit`` so codes stay consistent
            df_deep, _ = label_encode(
                df_deep, cols=df_deep.columns.tolist(),
                val_to_idx=self.encoding_dict)
        if self.continuous_cols is not None:
            df_cont = self._prepare_continuous(df)
            if self.scale:
                df_std = df_cont[self.standardize_cols]
                df_cont[self.standardize_cols] = self.scaler.transform(df_std.values)
        return self._combine(df_deep, df_cont).values

    def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
        """``fit`` already returns the transformed data, so just delegate."""
        return self.fit(df)
# def prepare_deep(df:pd.DataFrame,
# embed_cols:List[Union[str, Tuple[str,int]]]=None,
# cat_encodings:Optional[Dict[str,Dict[str,int]]]=None,
# continuous_cols:List[str]=None,
# already_standard:Optional[List[str]]=None, scale:bool=True,
# default_embed_dim:int=8):
# assert (embed_cols is not None) or (continuous_cols is not None), \
# 'Either the embedding columns or continuous columns must not be passed'
# # set the categorical columns that will be represented by embeddings
# if embed_cols is not None:
# if isinstance(embed_cols[0], Tuple):
# embed_dim = dict(embed_cols)
# embed_coln = [emb[0] for emb in embed_cols]
# else:
# embed_dim = {e:default_embed_dim for e in embed_cols}
# embed_coln = embed_cols
# df_deep = df.copy()[embed_coln]
# df_deep, encoding_dict = label_encode(df_deep, cols=embed_coln, val_to_idx=cat_encodings)
# embeddings_input = []
# for k,v in encoding_dict.items():
# embeddings_input.append((k, len(v), embed_dim[k]))
# else:
# embeddings_input, encoding_dict = None, None
# # set the continous columns
# if continuous_cols is not None:
# df_cont = df.copy()[continuous_cols]
# if scale:
# scaler = StandardScaler()
# if already_standard is not None:
# standardize_cols = [c for c in continuous_cols if c not in already_standard]
# else: standardize_cols = continuous_cols
# for cc in standardize_cols:
# df_cont[cc] = scaler.fit_transform(df_cont[cc].values.reshape(-1,1).astype(float))
# else:
# warnings.warn('Continuous columns will not be normalised')
# try:
# df_deep = pd.concat([df_deep, df_cont], axis=1)
# except:
# df_deep = df_cont.copy()
# if not 'scaler' in locals(): scaler=None
# deep_column_idx = {k:v for v,k in enumerate(df_deep.columns)}
# return df_deep.values, embeddings_input, encoding_dict, deep_column_idx, scaler
# def label_encode(df_inp:pd.DataFrame, cols:Optional[List[str]]=None,
# val_to_idx:Optional[Dict[str,Dict[str,int]]]=None):
# df = df_inp.copy()
# if cols == None:
# cols = list(df.select_dtypes(include=['object']).columns)
# if not val_to_idx:
# val_types = dict()
# for c in cols:
# val_types[c] = df[c].unique()
# val_to_idx = dict()
# for k, v in val_types.items():
# val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])}
# for k, v in val_to_idx.items():
# df[k] = df[k].apply(lambda x: v[x])
# return df, val_to_idx
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录