Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
6ec031ff
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
337
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
6ec031ff
编写于
8月 30, 2018
作者:
L
liuruilong
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add dilation
上级
ee6ef4d9
变更
18
隐藏空白更改
内联
并排
Showing
18 changed file
with
765 addition
and
726 deletion
+765
-726
metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
...addle-mobile-demo/paddle-mobile-demo/ViewController.swift
+3
-2
metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
+2
-0
metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
...l/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
+1
-1
metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift
...le-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift
+1
-1
metal/paddle-mobile/paddle-mobile/Executor.swift
metal/paddle-mobile/paddle-mobile/Executor.swift
+9
-10
metal/paddle-mobile/paddle-mobile/Loader.swift
metal/paddle-mobile/paddle-mobile/Loader.swift
+158
-156
metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
+7
-0
metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
...ddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
+10
-3
metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
...dle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
+10
-3
metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
...mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
+1
-1
metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
...obile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
+8
-3
metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
...le/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
+1
-1
metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
...e-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
+3
-1
metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift
...paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift
+1
-0
metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift
...addle-mobile/Operators/Kernels/ElementwiseAddKernel.swift
+1
-1
metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
...le/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
+334
-331
metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift
metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift
+4
-2
metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
+211
-210
未找到文件。
metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
浏览文件 @
6ec031ff
...
@@ -26,7 +26,7 @@ let modelHelperMap: [SupportModel : Net] = [.mobilenet_ssd : MobileNet_ssd_hand.
...
@@ -26,7 +26,7 @@ let modelHelperMap: [SupportModel : Net] = [.mobilenet_ssd : MobileNet_ssd_hand.
enum
SupportModel
:
String
{
enum
SupportModel
:
String
{
// case mobilenet = "mobilenet"
// case mobilenet = "mobilenet"
case
mobilenet_ssd
=
"mobilenetssd"
case
mobilenet_ssd
=
"mobilenetssd"
case
genet
=
"enet"
case
genet
=
"
g
enet"
static
func
supportedModels
()
->
[
SupportModel
]
{
static
func
supportedModels
()
->
[
SupportModel
]
{
//.mobilenet,
//.mobilenet,
return
[
.
mobilenet_ssd
,
.
genet
]
return
[
.
mobilenet_ssd
,
.
genet
]
...
@@ -79,7 +79,7 @@ class ViewController: UIViewController {
...
@@ -79,7 +79,7 @@ class ViewController: UIViewController {
return
return
}
}
do
{
do
{
let
max
=
1
let
max
=
1
0
let
startDate
=
Date
.
init
()
let
startDate
=
Date
.
init
()
for
i
in
0
..<
max
{
for
i
in
0
..<
max
{
try
net
.
predict
(
inTexture
:
inTexture
)
{
[
weak
self
]
(
result
)
in
try
net
.
predict
(
inTexture
:
inTexture
)
{
[
weak
self
]
(
result
)
in
...
@@ -87,6 +87,7 @@ class ViewController: UIViewController {
...
@@ -87,6 +87,7 @@ class ViewController: UIViewController {
fatalError
()
fatalError
()
}
}
print
(
result
.
resultArray
)
if
i
==
max
-
1
{
if
i
==
max
-
1
{
let
time
=
Date
.
init
()
.
timeIntervalSince
(
startDate
)
let
time
=
Date
.
init
()
.
timeIntervalSince
(
startDate
)
DispatchQueue
.
main
.
async
{
DispatchQueue
.
main
.
async
{
...
...
metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
浏览文件 @
6ec031ff
...
@@ -699,6 +699,7 @@
...
@@ -699,6 +699,7 @@
"@executable_path/Frameworks"
,
"@executable_path/Frameworks"
,
"@loader_path/Frameworks"
,
"@loader_path/Frameworks"
,
);
);
MACH_O_TYPE
=
mh_dylib
;
MTL_LANGUAGE_REVISION
=
UseDeploymentTarget
;
MTL_LANGUAGE_REVISION
=
UseDeploymentTarget
;
PRODUCT_BUNDLE_IDENTIFIER
=
"orange.paddle-mobile"
;
PRODUCT_BUNDLE_IDENTIFIER
=
"orange.paddle-mobile"
;
PRODUCT_NAME
=
"$(TARGET_NAME:c99extidentifier)"
;
PRODUCT_NAME
=
"$(TARGET_NAME:c99extidentifier)"
;
...
@@ -727,6 +728,7 @@
...
@@ -727,6 +728,7 @@
"@executable_path/Frameworks"
,
"@executable_path/Frameworks"
,
"@loader_path/Frameworks"
,
"@loader_path/Frameworks"
,
);
);
MACH_O_TYPE
=
mh_dylib
;
MTL_LANGUAGE_REVISION
=
UseDeploymentTarget
;
MTL_LANGUAGE_REVISION
=
UseDeploymentTarget
;
PRODUCT_BUNDLE_IDENTIFIER
=
"orange.paddle-mobile"
;
PRODUCT_BUNDLE_IDENTIFIER
=
"orange.paddle-mobile"
;
PRODUCT_NAME
=
"$(TARGET_NAME:c99extidentifier)"
;
PRODUCT_NAME
=
"$(TARGET_NAME:c99extidentifier)"
;
...
...
metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
浏览文件 @
6ec031ff
...
@@ -342,7 +342,7 @@ public extension MTLTexture {
...
@@ -342,7 +342,7 @@ public extension MTLTexture {
// n c h w - dim
// n c h w - dim
func
toTensor
(
dim
:
(
n
:
Int
,
c
:
Int
,
h
:
Int
,
w
:
Int
))
->
[
Float32
]
{
func
toTensor
(
dim
:
(
n
:
Int
,
c
:
Int
,
h
:
Int
,
w
:
Int
))
->
[
Float32
]
{
print
(
"origin dim:
\(
dim
)
"
)
//
print("origin dim: \(dim)")
print
(
"texture: "
)
print
(
"texture: "
)
print
(
self
)
print
(
self
)
...
...
metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift
浏览文件 @
6ec031ff
...
@@ -314,7 +314,7 @@ public class PaddleMobileUnitTest {
...
@@ -314,7 +314,7 @@ public class PaddleMobileUnitTest {
let
offsetX
=
filterSize
.
width
/
2
-
paddings
.
0
let
offsetX
=
filterSize
.
width
/
2
-
paddings
.
0
let
offsetY
=
filterSize
.
height
/
2
-
paddings
.
1
let
offsetY
=
filterSize
.
height
/
2
-
paddings
.
1
let
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
0
,
strideX
:
UInt16
(
stride
.
0
),
strideY
:
UInt16
(
stride
.
1
),
paddedZ
:
UInt16
(
paddings
.
0
))
let
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
0
,
strideX
:
UInt16
(
stride
.
0
),
strideY
:
UInt16
(
stride
.
1
),
paddedZ
:
UInt16
(
paddings
.
0
)
,
dilationX
:
UInt16
(
1
),
dilationY
:
UInt16
(
1
)
)
let
param
=
ConvAddBatchNormReluTestParam
.
init
(
inInputTexture
:
inputeTexture
,
inOutputTexture
:
outputTexture
,
inMetalParam
:
metalParam
,
inFilterBuffer
:
filterBuffer
,
inBiaseBuffer
:
biaseBuffer
,
inNewScaleBuffer
:
newScalueBuffer
,
inNewBiaseBuffer
:
newBiaseBuffer
,
inFilterSize
:
filterSize
)
let
param
=
ConvAddBatchNormReluTestParam
.
init
(
inInputTexture
:
inputeTexture
,
inOutputTexture
:
outputTexture
,
inMetalParam
:
metalParam
,
inFilterBuffer
:
filterBuffer
,
inBiaseBuffer
:
biaseBuffer
,
inNewScaleBuffer
:
newScalueBuffer
,
inNewBiaseBuffer
:
newBiaseBuffer
,
inFilterSize
:
filterSize
)
...
...
metal/paddle-mobile/paddle-mobile/Executor.swift
浏览文件 @
6ec031ff
...
@@ -14,7 +14,7 @@
...
@@ -14,7 +14,7 @@
import
Foundation
import
Foundation
let
testTo
=
12
let
testTo
=
54
public
class
ResultHolder
<
P
:
PrecisionType
>
{
public
class
ResultHolder
<
P
:
PrecisionType
>
{
public
let
dim
:
[
Int
]
public
let
dim
:
[
Int
]
...
@@ -62,7 +62,7 @@ public class Executor<P: PrecisionType> {
...
@@ -62,7 +62,7 @@ public class Executor<P: PrecisionType> {
queue
=
inQueue
queue
=
inQueue
for
block
in
inProgram
.
programDesc
.
blocks
{
for
block
in
inProgram
.
programDesc
.
blocks
{
//block.ops.count
//block.ops.count
for
i
in
0
..<
testTo
{
for
i
in
0
..<
block
.
ops
.
count
{
let
op
=
block
.
ops
[
i
]
let
op
=
block
.
ops
[
i
]
do
{
do
{
let
op
=
try
OpCreator
<
P
>.
shared
.
creat
(
device
:
inDevice
,
opDesc
:
op
,
scope
:
inProgram
.
scope
)
let
op
=
try
OpCreator
<
P
>.
shared
.
creat
(
device
:
inDevice
,
opDesc
:
op
,
scope
:
inProgram
.
scope
)
...
@@ -124,13 +124,13 @@ public class Executor<P: PrecisionType> {
...
@@ -124,13 +124,13 @@ public class Executor<P: PrecisionType> {
// print(stridableInput)
// print(stridableInput)
// let _: Flo? = input.logDesc(header: "input: ", stridable: true)
// let _: Flo? = input.logDesc(header: "input: ", stridable: true)
for
i
in
0
..<
self
.
ops
.
count
{
//
for i in 0..<self.ops.count {
let
op
=
self
.
ops
[
i
]
//
let op = self.ops[i]
print
(
" 第
\(
i
)
个 op: "
)
//
print(" 第 \(i) 个 op: ")
op
.
delogOutput
()
//
op.delogOutput()
}
//
}
//
return
//
return
let
afterDate
=
Date
.
init
()
let
afterDate
=
Date
.
init
()
...
@@ -145,7 +145,6 @@ public class Executor<P: PrecisionType> {
...
@@ -145,7 +145,6 @@ public class Executor<P: PrecisionType> {
return
p
return
p
}),
inElapsedTime
:
afterDate
.
timeIntervalSince
(
beforeDate
))
}),
inElapsedTime
:
afterDate
.
timeIntervalSince
(
beforeDate
))
}
}
completionHandle
(
resultHolder
)
completionHandle
(
resultHolder
)
}
}
...
...
metal/paddle-mobile/paddle-mobile/Loader.swift
浏览文件 @
6ec031ff
...
@@ -16,168 +16,170 @@ import Foundation
...
@@ -16,168 +16,170 @@ import Foundation
import
SwiftProtobuf
import
SwiftProtobuf
public
class
Loader
<
P
:
PrecisionType
>
{
public
class
Loader
<
P
:
PrecisionType
>
{
class
ParaLoader
{
class
ParaLoader
{
let
file
:
UnsafeMutablePointer
<
FILE
>
let
file
:
UnsafeMutablePointer
<
FILE
>
let
fileSize
:
Int
let
fileSize
:
Int
var
nowIndex
:
Int
var
nowIndex
:
Int
init
(
paramPath
:
String
)
throws
{
init
(
paramPath
:
String
)
throws
{
guard
let
tmpFile
=
fopen
(
paramPath
,
"rb"
)
else
{
guard
let
tmpFile
=
fopen
(
paramPath
,
"rb"
)
else
{
throw
PaddleMobileError
.
loaderError
(
message
:
"open param file error"
+
paramPath
)
throw
PaddleMobileError
.
loaderError
(
message
:
"open param file error"
+
paramPath
)
}
}
file
=
tmpFile
file
=
tmpFile
fseek
(
file
,
0
,
SEEK_END
)
fseek
(
file
,
0
,
SEEK_END
)
fileSize
=
ftell
(
file
)
fileSize
=
ftell
(
file
)
guard
fileSize
>
0
else
{
guard
fileSize
>
0
else
{
throw
PaddleMobileError
.
loaderError
(
message
:
"param file size is too small"
)
throw
PaddleMobileError
.
loaderError
(
message
:
"param file size is too small"
)
}
}
rewind
(
file
)
rewind
(
file
)
nowIndex
=
0
nowIndex
=
0
}
func
read
(
tensor
:
Tensor
<
P
>
)
throws
{
guard
nowIndex
<=
fileSize
else
{
throw
PaddleMobileError
.
loaderError
(
message
:
"out of the file range"
)
}
func
pointerReader
<
T
>
(
type
:
T
.
Type
)
->
T
{
let
ptr
=
UnsafeMutablePointer
<
T
>.
allocate
(
capacity
:
MemoryLayout
<
T
>.
size
)
fread
(
ptr
,
1
,
MemoryLayout
<
T
>.
size
,
file
)
nowIndex
+=
MemoryLayout
<
T
>.
size
let
pointee
=
ptr
.
pointee
ptr
.
deinitialize
(
count
:
MemoryLayout
<
UInt32
>.
size
)
ptr
.
deallocate
()
return
pointee
}
let
_
=
pointerReader
(
type
:
UInt32
.
self
)
let
lodLevel
=
pointerReader
(
type
:
UInt64
.
self
)
for
_
in
0
..<
lodLevel
{
let
size
=
pointerReader
(
type
:
UInt64
.
self
)
for
_
in
0
..<
Int
(
size
/
UInt64
(
MemoryLayout
<
size_t
>.
size
)){
_
=
pointerReader
(
type
:
size_t
.
self
)
}
}
let
_
=
pointerReader
(
type
:
UInt32
.
self
)
let
tensorDescSize
=
pointerReader
(
type
:
Int32
.
self
)
fseek
(
file
,
Int
(
tensorDescSize
),
SEEK_CUR
)
nowIndex
+=
Int
(
tensorDescSize
)
/*
这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度
*/
//现在模型传入模型为 Float 类型, 这块应该根据模型来
// let tmpCapacity = MemoryLayout<Float>.size * tensor.numel()
// let tmpPointer = UnsafeMutablePointer<Float>.allocate(capacity: tmpCapacity);
let
bytesRead
=
fread
(
tensor
.
data
.
pointer
,
1
,
tensor
.
data
.
size
,
file
)
guard
bytesRead
==
tensor
.
data
.
size
else
{
throw
PaddleMobileError
.
loaderError
(
message
:
"param read size error"
)
}
// TODO: use script to convert
// let bytesRead = fread(tmpPointer, 1, tmpCapacity, file)
// for i in 0..<tensor.numel() {
// tensor.data[i] = P.init(inFloat: tmpPointer[i])
// }
// tmpPointer.deinitialize(count: tmpCapacity)
// tmpPointer.deallocate()
nowIndex
+=
bytesRead
}
deinit
{
fclose
(
file
)
}
}
}
public
init
(){}
public
func
load
(
device
:
MTLDevice
,
modelPath
:
String
,
paraPath
:
String
)
throws
->
Program
{
func
read
(
tensor
:
Tensor
<
P
>
)
throws
{
guard
let
modelData
=
try
?
Data
.
init
(
contentsOf
:
URL
.
init
(
fileURLWithPath
:
modelPath
))
else
{
guard
nowIndex
<=
fileSize
else
{
throw
PaddleMobileError
.
loaderError
(
message
:
"load "
+
modelPath
+
" failed !"
)
throw
PaddleMobileError
.
loaderError
(
message
:
"out of the file range"
)
}
func
pointerReader
<
T
>
(
type
:
T
.
Type
)
->
T
{
let
ptr
=
UnsafeMutablePointer
<
T
>.
allocate
(
capacity
:
MemoryLayout
<
T
>.
size
)
fread
(
ptr
,
1
,
MemoryLayout
<
T
>.
size
,
file
)
nowIndex
+=
MemoryLayout
<
T
>.
size
let
pointee
=
ptr
.
pointee
ptr
.
deinitialize
(
count
:
MemoryLayout
<
UInt32
>.
size
)
ptr
.
deallocate
()
return
pointee
}
let
_
=
pointerReader
(
type
:
UInt32
.
self
)
let
lodLevel
=
pointerReader
(
type
:
UInt64
.
self
)
for
_
in
0
..<
lodLevel
{
let
size
=
pointerReader
(
type
:
UInt64
.
self
)
for
_
in
0
..<
Int
(
size
/
UInt64
(
MemoryLayout
<
size_t
>.
size
)){
_
=
pointerReader
(
type
:
size_t
.
self
)
}
}
}
do
{
let
protoProgram
=
try
PaddleMobile_Framework_Proto_ProgramDesc
.
init
(
let
_
=
pointerReader
(
type
:
UInt32
.
self
)
serializedData
:
modelData
)
let
tensorDescSize
=
pointerReader
(
type
:
Int32
.
self
)
let
originProgramDesc
=
ProgramDesc
.
init
(
protoProgram
:
protoProgram
)
let
programDesc
=
ProgramOptimize
<
P
>.
init
()
.
optimize
(
originProgramDesc
:
originProgramDesc
)
fseek
(
file
,
Int
(
tensorDescSize
),
SEEK_CUR
)
print
(
programDesc
)
nowIndex
+=
Int
(
tensorDescSize
)
guard
let
paraLoader
=
try
?
ParaLoader
.
init
(
paramPath
:
paraPath
)
else
{
/*
throw
PaddleMobileError
.
loaderError
(
message
:
"load para error"
)
这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度
}
*/
guard
programDesc
.
blocks
.
count
>
0
else
{
//现在模型传入模型为 Float 类型, 这块应该根据模型来
throw
PaddleMobileError
.
loaderError
(
message
:
"count of blocks must greater than 0"
)
// let tmpCapacity = MemoryLayout<Float>.size * tensor.numel()
}
// let tmpPointer = UnsafeMutablePointer<Float>.allocate(capacity: tmpCapacity);
let
bytesRead
=
fread
(
tensor
.
data
.
pointer
,
1
,
tensor
.
data
.
size
,
file
)
// to get feed key and fetch key
let
block
=
programDesc
.
blocks
[
0
]
guard
bytesRead
==
tensor
.
data
.
size
else
{
guard
let
firstOp
=
block
.
ops
.
first
,
let
lastOp
=
block
.
ops
.
last
else
{
throw
PaddleMobileError
.
loaderError
(
message
:
"param read size error"
)
throw
PaddleMobileError
.
loaderError
(
message
:
"at least two operator"
)
}
}
guard
firstOp
.
type
==
gFeedType
,
lastOp
.
type
==
gFetchType
else
{
// TODO: use script to convert
throw
PaddleMobileError
.
loaderError
(
message
:
"the first op is not feed or the last op is not fetch"
)
// let bytesRead = fread(tmpPointer, 1, tmpCapacity, file)
// for i in 0..<tensor.numel() {
// tensor.data[i] = P.init(inFloat: tmpPointer[i])
// }
// tmpPointer.deinitialize(count: tmpCapacity)
// tmpPointer.deallocate()
nowIndex
+=
bytesRead
}
deinit
{
fclose
(
file
)
}
}
public
init
(){}
public
func
load
(
device
:
MTLDevice
,
modelPath
:
String
,
paraPath
:
String
)
throws
->
Program
{
guard
let
modelData
=
try
?
Data
.
init
(
contentsOf
:
URL
.
init
(
fileURLWithPath
:
modelPath
))
else
{
throw
PaddleMobileError
.
loaderError
(
message
:
"load "
+
modelPath
+
" failed !"
)
}
do
{
let
protoProgram
=
try
PaddleMobile_Framework_Proto_ProgramDesc
.
init
(
serializedData
:
modelData
)
let
originProgramDesc
=
ProgramDesc
.
init
(
protoProgram
:
protoProgram
)
let
programDesc
=
ProgramOptimize
<
P
>.
init
()
.
optimize
(
originProgramDesc
:
originProgramDesc
)
print
(
programDesc
)
guard
let
paraLoader
=
try
?
ParaLoader
.
init
(
paramPath
:
paraPath
)
else
{
throw
PaddleMobileError
.
loaderError
(
message
:
"load para error"
)
}
guard
programDesc
.
blocks
.
count
>
0
else
{
throw
PaddleMobileError
.
loaderError
(
message
:
"count of blocks must greater than 0"
)
}
// to get feed key and fetch key
let
block
=
programDesc
.
blocks
[
0
]
guard
let
firstOp
=
block
.
ops
.
first
,
let
lastOp
=
block
.
ops
.
last
else
{
throw
PaddleMobileError
.
loaderError
(
message
:
"at least two operator"
)
}
guard
firstOp
.
type
==
gFeedType
,
lastOp
.
type
==
gFetchType
else
{
throw
PaddleMobileError
.
loaderError
(
message
:
"the first op is not feed or the last op is not fetch"
)
}
guard
let
inputKey
=
opInfos
[
gFeedType
]?
.
inputs
.
first
,
let
outKey
=
opInfos
[
gFetchType
]?
.
outputs
.
first
else
{
throw
PaddleMobileError
.
loaderError
(
message
:
"the feed input key or fetch output key not found"
)
}
guard
let
feedKey
=
firstOp
.
inputs
[
inputKey
]?
.
first
,
let
fetchKey
=
lastOp
.
outputs
[
outKey
]?
.
first
else
{
throw
PaddleMobileError
.
loaderError
(
message
:
"feed key or fetch key not found"
)
}
let
scope
=
Scope
.
init
(
inFeedKey
:
feedKey
,
inFetchKey
:
fetchKey
)
// to load memory
for
block
in
programDesc
.
blocks
{
for
varDesc
in
block
.
vars
{
if
(
varDesc
.
type
==
.
LodTensor
)
{
guard
let
tensorDesc
=
varDesc
.
tensorDesc
else
{
throw
PaddleMobileError
.
loaderError
(
message
:
"get tensor desc failed"
)
}
}
guard
let
inputKey
=
opInfos
[
gFeedType
]?
.
inputs
.
first
,
let
outKey
=
opInfos
[
gFetchType
]?
.
outputs
.
first
else
{
if
(
varDesc
.
persistable
throw
PaddleMobileError
.
loaderError
(
message
:
"the feed input key or fetch output key not found"
)
&&
varDesc
.
type
!=
.
FeedMiniBatch
}
&&
varDesc
.
type
!=
.
FetchList
)
{
guard
let
feedKey
=
firstOp
.
inputs
[
inputKey
]?
.
first
,
let
fetchKey
=
lastOp
.
outputs
[
outKey
]?
.
first
else
{
let
dimArr
=
tensorDesc
.
dims
throw
PaddleMobileError
.
loaderError
(
message
:
"feed key or fetch key not found"
)
guard
dimArr
.
count
>
0
else
{
throw
PaddleMobileError
.
loaderError
(
message
:
"tensor desc dim size error"
)
}
let
dim
=
Dim
.
init
(
inDim
:
dimArr
)
let
tensor
=
Tensor
<
P
>.
init
(
inDim
:
dim
,
inLayout
:
tensorDesc
.
dataLayout
)
do
{
try
paraLoader
.
read
(
tensor
:
tensor
)
}
catch
let
error
{
throw
error
}
tensor
.
convert
(
to
:
DataLayout
.
NHWC
())
// tensor.initBuffer(device: device)
scope
[
varDesc
.
name
]
=
tensor
}
else
{
let
dim
=
Dim
.
init
(
inDim
:
tensorDesc
.
dims
)
scope
[
varDesc
.
name
]
=
Texture
<
P
>.
init
(
device
:
device
,
inDim
:
dim
)
}
}
}
else
{
let
scope
=
Scope
.
init
(
inFeedKey
:
feedKey
,
inFetchKey
:
fetchKey
)
if
varDesc
.
name
==
fetchKey
{
scope
[
varDesc
.
name
]
=
ResultHolder
<
P
>.
init
(
inDim
:
[],
inResult
:
[],
inElapsedTime
:
0.0
)
// to load memory
}
else
if
varDesc
.
name
==
feedKey
{
for
block
in
programDesc
.
blocks
{
for
varDesc
in
block
.
vars
{
if
(
varDesc
.
type
==
.
LodTensor
)
{
guard
let
tensorDesc
=
varDesc
.
tensorDesc
else
{
throw
PaddleMobileError
.
loaderError
(
message
:
"get tensor desc failed"
)
}
if
(
varDesc
.
persistable
&&
varDesc
.
type
!=
.
FeedMiniBatch
&&
varDesc
.
type
!=
.
FetchList
)
{
let
dimArr
=
tensorDesc
.
dims
guard
dimArr
.
count
>
0
else
{
throw
PaddleMobileError
.
loaderError
(
message
:
"tensor desc dim size error"
)
}
let
dim
=
Dim
.
init
(
inDim
:
dimArr
)
let
tensor
=
Tensor
<
P
>.
init
(
inDim
:
dim
,
inLayout
:
tensorDesc
.
dataLayout
)
do
{
try
paraLoader
.
read
(
tensor
:
tensor
)
}
catch
let
error
{
throw
error
}
tensor
.
convert
(
to
:
DataLayout
.
NHWC
())
// tensor.initBuffer(device: device)
scope
[
varDesc
.
name
]
=
tensor
}
else
{
let
dim
=
Dim
.
init
(
inDim
:
tensorDesc
.
dims
)
scope
[
varDesc
.
name
]
=
Texture
<
P
>.
init
(
device
:
device
,
inDim
:
dim
)
}
}
else
{
if
varDesc
.
name
==
fetchKey
{
scope
[
varDesc
.
name
]
=
ResultHolder
<
P
>.
init
(
inDim
:
[],
inResult
:
[],
inElapsedTime
:
0.0
)
}
else
if
varDesc
.
name
==
feedKey
{
}
}
}
}
}
}
let
program
=
Program
.
init
(
inProgramDesc
:
programDesc
,
inParamPath
:
paraPath
,
inScope
:
scope
)
return
program
}
catch
_
{
throw
PaddleMobileError
.
loaderError
(
message
:
"protobuf decoder error"
)
}
}
}
let
program
=
Program
.
init
(
inProgramDesc
:
programDesc
,
inParamPath
:
paraPath
,
inScope
:
scope
)
return
program
}
catch
_
{
throw
PaddleMobileError
.
loaderError
(
message
:
"protobuf decoder error"
)
}
}
}
}
}
metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
浏览文件 @
6ec031ff
...
@@ -97,6 +97,13 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>,
...
@@ -97,6 +97,13 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>,
}
}
func
delogOutput
()
{
func
delogOutput
()
{
print
(
"stride: "
)
print
(
para
.
stride
)
print
(
"dilations: "
)
print
(
para
.
dilations
)
print
(
"
\(
type
)
output: "
)
print
(
"
\(
type
)
output: "
)
print
(
para
.
output
.
metalTexture
.
toTensor
(
dim
:
(
n
:
para
.
output
.
tensorDim
[
0
],
c
:
para
.
output
.
tensorDim
[
1
],
h
:
para
.
output
.
tensorDim
[
2
],
w
:
para
.
output
.
tensorDim
[
3
]))
.
strideArray
())
print
(
para
.
output
.
metalTexture
.
toTensor
(
dim
:
(
n
:
para
.
output
.
tensorDim
[
0
],
c
:
para
.
output
.
tensorDim
[
1
],
h
:
para
.
output
.
tensorDim
[
2
],
w
:
para
.
output
.
tensorDim
[
3
]))
.
strideArray
())
}
}
...
...
metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
浏览文件 @
6ec031ff
...
@@ -43,8 +43,15 @@ class ConvTransposeOp<P: PrecisionType>: Operator<ConvTransposeKernel<P>, ConvTr
...
@@ -43,8 +43,15 @@ class ConvTransposeOp<P: PrecisionType>: Operator<ConvTransposeKernel<P>, ConvTr
}
}
func
delogOutput
()
{
func
delogOutput
()
{
print
(
"conv transpose delog"
)
print
(
"
\(
type
)
output: "
)
let
_
:
P
?
=
para
.
input
.
metalTexture
.
logDesc
(
header
:
"conv transpose input: "
,
stridable
:
true
)
let
originDim
=
para
.
output
.
originDim
let
_
:
P
?
=
para
.
output
.
metalTexture
.
logDesc
(
header
:
"conv transpose output: "
,
stridable
:
true
)
if
para
.
output
.
transpose
==
[
0
,
1
,
2
,
3
]
{
let
outputArray
=
para
.
output
.
metalTexture
.
realNHWC
(
dim
:
(
n
:
originDim
[
0
],
h
:
originDim
[
1
],
w
:
originDim
[
2
],
c
:
originDim
[
3
]))
print
(
outputArray
.
strideArray
())
}
else
if
para
.
output
.
transpose
==
[
0
,
2
,
3
,
1
]
{
print
(
para
.
output
.
metalTexture
.
toTensor
(
dim
:
(
n
:
para
.
output
.
tensorDim
[
0
],
c
:
para
.
output
.
tensorDim
[
1
],
h
:
para
.
output
.
tensorDim
[
2
],
w
:
para
.
output
.
tensorDim
[
3
]))
.
strideArray
())
}
else
{
print
(
" not implement"
)
}
}
}
}
}
metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
浏览文件 @
6ec031ff
...
@@ -61,11 +61,18 @@ class ElementwiseAddOp<P: PrecisionType>: Operator<ElementwiseAddKernel<P>, Elem
...
@@ -61,11 +61,18 @@ class ElementwiseAddOp<P: PrecisionType>: Operator<ElementwiseAddKernel<P>, Elem
print
(
para
.
inputX
.
metalTexture
.
toTensor
(
dim
:
(
n
:
para
.
inputX
.
tensorDim
[
0
],
c
:
para
.
inputX
.
tensorDim
[
1
],
h
:
para
.
inputX
.
tensorDim
[
2
],
w
:
para
.
inputX
.
tensorDim
[
3
]))
.
strideArray
())
print
(
para
.
inputX
.
metalTexture
.
toTensor
(
dim
:
(
n
:
para
.
inputX
.
tensorDim
[
0
],
c
:
para
.
inputX
.
tensorDim
[
1
],
h
:
para
.
inputX
.
tensorDim
[
2
],
w
:
para
.
inputX
.
tensorDim
[
3
]))
.
strideArray
())
print
(
"
\(
type
)
inputY: "
)
print
(
"
\(
type
)
inputY: "
)
print
(
para
.
inputY
.
metalTexture
.
toTensor
(
dim
:
(
n
:
para
.
inputY
.
tensorDim
[
0
],
c
:
para
.
inputY
.
tensorDim
[
1
],
h
:
para
.
inputY
.
tensorDim
[
2
],
w
:
para
.
inputY
.
tensorDim
[
3
]))
.
strideArray
())
print
(
para
.
inputY
.
metalTexture
.
toTensor
(
dim
:
(
n
:
para
.
inputY
.
tensorDim
[
0
],
c
:
para
.
inputY
.
tensorDim
[
1
],
h
:
para
.
inputY
.
tensorDim
[
2
],
w
:
para
.
inputY
.
tensorDim
[
3
]))
.
strideArray
())
print
(
"
\(
type
)
output: "
)
print
(
"
\(
type
)
output: "
)
let
originDim
=
para
.
output
.
originDim
let
originDim
=
para
.
output
.
originDim
let
outputArray
=
para
.
output
.
metalTexture
.
realNHWC
(
dim
:
(
n
:
originDim
[
0
],
h
:
originDim
[
1
],
w
:
originDim
[
2
],
c
:
originDim
[
3
]))
if
para
.
output
.
transpose
==
[
0
,
1
,
2
,
3
]
{
print
(
outputArray
.
strideArray
())
let
outputArray
=
para
.
output
.
metalTexture
.
realNHWC
(
dim
:
(
n
:
originDim
[
0
],
h
:
originDim
[
1
],
w
:
originDim
[
2
],
c
:
originDim
[
3
]))
print
(
para
.
output
.
metalTexture
.
toTensor
(
dim
:
(
n
:
para
.
output
.
tensorDim
[
0
],
c
:
para
.
output
.
tensorDim
[
1
],
h
:
para
.
output
.
tensorDim
[
2
],
w
:
para
.
output
.
tensorDim
[
3
]))
.
strideArray
())
print
(
outputArray
.
strideArray
())
}
else
if
para
.
output
.
transpose
==
[
0
,
2
,
3
,
1
]
{
print
(
para
.
output
.
metalTexture
.
toTensor
(
dim
:
(
n
:
para
.
output
.
tensorDim
[
0
],
c
:
para
.
output
.
tensorDim
[
1
],
h
:
para
.
output
.
tensorDim
[
2
],
w
:
para
.
output
.
tensorDim
[
3
]))
.
strideArray
())
}
else
{
print
(
" not implement"
)
}
}
}
func
runImpl
(
device
:
MTLDevice
,
buffer
:
MTLCommandBuffer
)
throws
{
func
runImpl
(
device
:
MTLDevice
,
buffer
:
MTLCommandBuffer
)
throws
{
...
...
metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
浏览文件 @
6ec031ff
...
@@ -75,7 +75,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
...
@@ -75,7 +75,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
print
(
"offset y:
\(
offsetY
)
"
)
print
(
"offset y:
\(
offsetY
)
"
)
let
offsetZ
=
0.0
let
offsetZ
=
0.0
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
paddedZ
:
UInt16
(
param
.
input
.
metalTexture
.
arrayLength
*
4
-
param
.
input
.
dim
[
3
]))
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
paddedZ
:
UInt16
(
param
.
input
.
metalTexture
.
arrayLength
*
4
-
param
.
input
.
dim
[
3
])
,
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
])
)
var
invs
:
[
P
]
=
[]
var
invs
:
[
P
]
=
[]
let
varianceContents
=
param
.
variance
.
buffer
.
contents
()
.
assumingMemoryBound
(
to
:
P
.
self
)
let
varianceContents
=
param
.
variance
.
buffer
.
contents
()
.
assumingMemoryBound
(
to
:
P
.
self
)
...
...
metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
浏览文件 @
6ec031ff
...
@@ -27,9 +27,10 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
...
@@ -27,9 +27,10 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
param
.
output
.
initTexture
(
device
:
device
,
inTranspose
:
[
0
,
2
,
3
,
1
])
param
.
output
.
initTexture
(
device
:
device
,
inTranspose
:
[
0
,
2
,
3
,
1
])
let
offsetX
=
param
.
filter
.
width
/
2
-
Int
(
param
.
paddings
[
0
])
let
offsetX
=
(
Int
(
param
.
dilations
[
0
])
*
(
param
.
filter
.
width
-
1
)
+
1
)
/
2
-
Int
(
param
.
paddings
[
0
])
let
offsetY
=
param
.
filter
.
height
/
2
-
Int
(
param
.
paddings
[
1
])
let
offsetY
=
(
Int
(
param
.
dilations
[
1
])
*
(
param
.
filter
.
height
-
1
)
+
1
)
/
2
-
Int
(
param
.
paddings
[
1
])
param
.
filter
.
initBuffer
(
device
:
device
,
precision
:
Tensor
.
BufferPrecision
.
Float32
)
param
.
filter
.
initBuffer
(
device
:
device
,
precision
:
Tensor
.
BufferPrecision
.
Float32
)
param
.
y
.
initBuffer
(
device
:
device
,
precision
:
Tensor
.
BufferPrecision
.
Float32
)
param
.
y
.
initBuffer
(
device
:
device
,
precision
:
Tensor
.
BufferPrecision
.
Float32
)
...
@@ -37,7 +38,11 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
...
@@ -37,7 +38,11 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
print
(
"offset y:
\(
offsetY
)
"
)
print
(
"offset y:
\(
offsetY
)
"
)
let
offsetZ
=
0.0
let
offsetZ
=
0.0
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
paddedZ
:
UInt16
(
param
.
input
.
metalTexture
.
arrayLength
*
4
-
param
.
input
.
dim
[
3
]))
let
inMetalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
paddedZ
:
UInt16
(
param
.
input
.
metalTexture
.
arrayLength
*
4
-
param
.
input
.
dim
[
3
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]))
print
(
"metal param: "
)
print
(
inMetalParam
)
metalParam
=
inMetalParam
}
}
func
compute
(
commandBuffer
:
MTLCommandBuffer
,
param
:
ConvAddParam
<
P
>
)
throws
{
func
compute
(
commandBuffer
:
MTLCommandBuffer
,
param
:
ConvAddParam
<
P
>
)
throws
{
...
...
metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
浏览文件 @
6ec031ff
...
@@ -81,7 +81,7 @@ class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
...
@@ -81,7 +81,7 @@ class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
let
offsetZ
=
0.0
let
offsetZ
=
0.0
print
(
" fuck "
)
print
(
" fuck "
)
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
paddedZ
:
UInt16
(
param
.
input
.
metalTexture
.
arrayLength
*
4
-
param
.
input
.
dim
[
3
]))
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
paddedZ
:
UInt16
(
param
.
input
.
metalTexture
.
arrayLength
*
4
-
param
.
input
.
dim
[
3
])
,
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
])
)
var
invs
:
[
P
]
=
[]
var
invs
:
[
P
]
=
[]
let
varianceContents
=
param
.
variance
.
buffer
.
contents
()
.
assumingMemoryBound
(
to
:
P
.
self
)
let
varianceContents
=
param
.
variance
.
buffer
.
contents
()
.
assumingMemoryBound
(
to
:
P
.
self
)
...
...
metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
浏览文件 @
6ec031ff
...
@@ -21,6 +21,8 @@ public struct MetalConvParam {
...
@@ -21,6 +21,8 @@ public struct MetalConvParam {
let
strideX
:
UInt16
let
strideX
:
UInt16
let
strideY
:
UInt16
let
strideY
:
UInt16
let
paddedZ
:
UInt16
let
paddedZ
:
UInt16
let
dilationX
:
UInt16
let
dilationY
:
UInt16
}
}
class
ConvKernel
<
P
:
PrecisionType
>
:
Kernel
,
Computable
{
class
ConvKernel
<
P
:
PrecisionType
>
:
Kernel
,
Computable
{
...
@@ -39,7 +41,7 @@ class ConvKernel<P: PrecisionType>: Kernel, Computable {
...
@@ -39,7 +41,7 @@ class ConvKernel<P: PrecisionType>: Kernel, Computable {
let
offsetZ
=
0.0
let
offsetZ
=
0.0
param
.
filter
.
initBuffer
(
device
:
device
,
precision
:
Tensor
.
BufferPrecision
.
Float32
)
param
.
filter
.
initBuffer
(
device
:
device
,
precision
:
Tensor
.
BufferPrecision
.
Float32
)
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
paddedZ
:
UInt16
(
param
.
input
.
metalTexture
.
arrayLength
*
4
-
param
.
input
.
dim
[
3
]))
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
paddedZ
:
UInt16
(
param
.
input
.
metalTexture
.
arrayLength
*
4
-
param
.
input
.
dim
[
3
])
,
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
])
)
}
}
func
compute
(
commandBuffer
:
MTLCommandBuffer
,
param
:
ConvParam
<
P
>
)
throws
{
func
compute
(
commandBuffer
:
MTLCommandBuffer
,
param
:
ConvParam
<
P
>
)
throws
{
...
...
metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift
浏览文件 @
6ec031ff
...
@@ -45,6 +45,7 @@ class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{
...
@@ -45,6 +45,7 @@ class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{
metalParam
=
MetalConvTransposeParam
.
init
(
kernelW
:
kernelWidth
,
kernelH
:
kernelHeight
,
strideX
:
strideX
,
strideY
:
strideY
,
paddingX
:
paddingX
,
paddingY
:
paddingY
,
dilationX
:
dilationX
,
dilationY
:
dilationY
)
metalParam
=
MetalConvTransposeParam
.
init
(
kernelW
:
kernelWidth
,
kernelH
:
kernelHeight
,
strideX
:
strideX
,
strideY
:
strideY
,
paddingX
:
paddingX
,
paddingY
:
paddingY
,
dilationX
:
dilationX
,
dilationY
:
dilationY
)
param
.
output
.
initTexture
(
device
:
device
,
inTranspose
:
param
.
input
.
transpose
)
param
.
output
.
initTexture
(
device
:
device
,
inTranspose
:
param
.
input
.
transpose
)
param
.
filter
.
initBuffer
(
device
:
device
)
}
}
func
compute
(
commandBuffer
:
MTLCommandBuffer
,
param
:
ConvTransposeParam
<
P
>
)
throws
{
func
compute
(
commandBuffer
:
MTLCommandBuffer
,
param
:
ConvTransposeParam
<
P
>
)
throws
{
...
...
metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift
浏览文件 @
6ec031ff
...
@@ -55,7 +55,7 @@ class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable {
...
@@ -55,7 +55,7 @@ class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable {
}
}
emp
.
yoff
=
4
-
Int32
(
param
.
inputY
.
tensorDim
.
cout
())
emp
.
yoff
=
4
-
Int32
(
param
.
inputY
.
tensorDim
.
cout
())
if
(
param
.
inputX
.
dim
==
param
.
inputY
.
dim
)
&&
(
param
.
inputX
.
transpose
==
param
.
inputY
.
transpose
)
{
if
(
param
.
inputX
.
dim
==
param
.
inputY
.
dim
)
&&
(
param
.
inputX
.
transpose
==
param
.
inputY
.
transpose
)
{
print
(
"===> elementwise_add fast!!!"
)
//
print("===> elementwise_add fast!!!")
emp
.
fast
=
1
emp
.
fast
=
1
}
}
...
...
metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
浏览文件 @
6ec031ff
...
@@ -16,198 +16,198 @@
...
@@ -16,198 +16,198 @@
using namespace metal;
using namespace metal;
struct MetalConvParam {
struct MetalConvParam {
short offsetX;
short offsetX;
short offsetY;
short offsetY;
short offsetZ;
short offsetZ;
ushort strideX;
ushort strideX;
ushort strideY;
ushort strideY;
ushort dilationX;
ushort dilationY;
};
};
kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device half4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
gid.z >= outTexture.get_array_size()) {
return;
return;
}
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
half4 output = half4(0.0);
half4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
half4 output = half4(0.0);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
half4 input;
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
for (uint i = 0; i < input_arr_size; ++i) {
output.z += dot(input, weight_z);
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
outTexture.write(output, gid.xy, gid.z);
output.w += dot(input, weight_w);
}
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
outTexture.write(output, gid.xy, gid.z);
}
}
kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device half4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
gid.z >= outTexture.get_array_size()) {
return;
return;
}
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
half4 output = half4(0.0);
half4 output = half4(0.0);
half4 input[9];
half4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
for (int j = 0; j < 9; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
output.x += dot(input[j], weight_x);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
output.y += dot(input[j], weight_y);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
output.z += dot(input[j], weight_z);
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
output.w += dot(input[j], weight_w);
}
}
}
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
}
outTexture.write(output, gid.xy, gid.z);
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
outTexture.write(output, gid.xy, gid.z);
}
}
kernel void conv_add_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
kernel void conv_add_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
gid.z >= outTexture.get_array_size()) {
return;
return;
}
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
half4 output = half4(0.0);
half4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
uint input_arr_size = inTexture.get_array_size()
;
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]
;
uint weithTo = gid.z * kernelHXW * input_arr_size * 4
;
output.y += dot(input, weight_y)
;
half4 output = half4(0.0);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
half4 input;
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
for (uint i = 0; i < input_arr_size; ++i) {
output.w += dot(input, weight_w);
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
}
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output = output + biase[gid.z];
output.x += dot(input, weight_x);
outTexture.write(output, gid.xy, gid.z);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
output = output + biase[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
}
kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device half *weights [[buffer(1)]],
const device half *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device half4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
gid.z >= outTexture.get_array_size()) {
return;
return;
}
}
uint output_slice = gid.z;
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
uint weithTo = gid.z * kernelHXW * 4;
half4 output = half4(0.0);
half4 output = half4(0.0);
half4 inputs[9];
half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
for (int j = 0; j < 9; ++j) {
half4 input = inputs[j];
half4 input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
}
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
outTexture.write(output, gid.xy, gid.z);
outTexture.write(output, gid.xy, gid.z);
}
}
...
@@ -223,41 +223,41 @@ kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample>
...
@@ -223,41 +223,41 @@ kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample>
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
if (gid.x >= outTexture.get_width() ||
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
gid.y >= outTexture.get_height() ||
output.y += dot(input, weight_y);
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
float4 input;
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
for (uint i = 0; i < input_arr_size; ++i) {
output.w += dot(input, weight_w);
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
}
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
output.x += dot(input, weight_x);
outTexture.write(output, gid.xy, gid.z);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z);
}
}
kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
...
@@ -268,92 +268,92 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample>
...
@@ -268,92 +268,92 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample>
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
gid.z >= outTexture.get_array_size()) {
return;
return;
}
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 output = float4(0.0);
float4 input[9];
float4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
for (int j = 0; j < 9; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
output.w += dot(input[j], weight_w);
}
}
}
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
}
outTexture.write(output, gid.xy, gid.z);
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z);
}
}
kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device float *weights [[buffer(1)]],
const device float *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
gid.z >= outTexture.get_array_size()) {
return;
return;
}
}
uint output_slice = gid.z;
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0);
float4 output = float4(0.0);
float4 inputs[9];
float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
for (int j = 0; j < 9; ++j) {
float4 input = inputs[j];
float4 input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
}
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z);
outTexture.write(output, gid.xy, gid.z);
}
}
// conv
// conv
...
@@ -447,10 +447,10 @@ kernel void depthwise_conv_3x3(texture2d_array<float, access::sample> inTexture
...
@@ -447,10 +447,10 @@ kernel void depthwise_conv_3x3(texture2d_array<float, access::sample> inTexture
}
}
kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.y >= outTexture.get_height() ||
...
@@ -532,13 +532,13 @@ kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[text
...
@@ -532,13 +532,13 @@ kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[text
}
}
kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.y >= outTexture.get_height() ||
...
@@ -556,17 +556,20 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text
...
@@ -556,17 +556,20 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text
float4 output = float4(0.0);
float4 output = float4(0.0);
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
float4 input[9];
float4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x -
1, posInInput.y - 1
), i);
input[0] = inTexture.sample(sample, float2(posInInput.x -
dilation_x, posInInput.y - dilation_y
), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y -
1
), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y -
dilation_y
), i);
input[2] = inTexture.sample(sample, float2(posInInput.x +
1, posInInput.y - 1
), i);
input[2] = inTexture.sample(sample, float2(posInInput.x +
dilation_x, posInInput.y - dilation_y
), i);
input[3] = inTexture.sample(sample, float2(posInInput.x -
1
, posInInput.y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x -
dilation_x
, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x +
1
, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x +
dilation_x
, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x -
1, posInInput.y + 1
), i);
input[6] = inTexture.sample(sample, float2(posInInput.x -
dilation_x, posInInput.y + dilation_y
), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y +
1
), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y +
dilation_y
), i);
input[8] = inTexture.sample(sample, float2(posInInput.x +
1, posInInput.y + 1
), i);
input[8] = inTexture.sample(sample, float2(posInInput.x +
dilation_x, posInInput.y + dilation_y
), i);
for (int j = 0; j < 9; ++j) {
for (int j = 0; j < 9; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
output.x += dot(input[j], weight_x);
...
@@ -586,13 +589,13 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text
...
@@ -586,13 +589,13 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text
}
}
kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device float *weights [[buffer(1)]],
const device float *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.y >= outTexture.get_height() ||
...
@@ -629,12 +632,12 @@ kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inText
...
@@ -629,12 +632,12 @@ kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inText
#pragma mark - conv bn relu
#pragma mark - conv bn relu
kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *weights [[buffer(1)]],
const device float4 *new_scale [[buffer(2)]],
const device float4 *new_scale [[buffer(2)]],
const device float4 *new_biase [[buffer(3)]],
const device float4 *new_biase [[buffer(3)]],
uint3 gid [[thread_position_in_grid]]) {
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.y >= outTexture.get_height() ||
...
@@ -673,12 +676,12 @@ kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTe
...
@@ -673,12 +676,12 @@ kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTe
}
}
kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *weights [[buffer(1)]],
const device float4 *new_scale [[buffer(2)]],
const device float4 *new_scale [[buffer(2)]],
const device float4 *new_biase [[buffer(3)]],
const device float4 *new_biase [[buffer(3)]],
uint3 gid [[thread_position_in_grid]]) {
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.y >= outTexture.get_height() ||
...
@@ -726,12 +729,12 @@ kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTe
...
@@ -726,12 +729,12 @@ kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTe
}
}
kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device float *weights [[buffer(1)]],
const device float *weights [[buffer(1)]],
const device float4 *new_scale [[buffer(2)]],
const device float4 *new_scale [[buffer(2)]],
const device float4 *new_biase [[buffer(3)]],
const device float4 *new_biase [[buffer(3)]],
uint3 gid [[thread_position_in_grid]]) {
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.y >= outTexture.get_height() ||
...
...
metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift
浏览文件 @
6ec031ff
...
@@ -48,8 +48,10 @@ extension BlockDesc: CustomStringConvertible, CustomDebugStringConvertible {
...
@@ -48,8 +48,10 @@ extension BlockDesc: CustomStringConvertible, CustomDebugStringConvertible {
var
description
:
String
{
var
description
:
String
{
var
str
=
""
var
str
=
""
for
op
in
ops
{
for
i
in
0
..<
ops
.
count
{
str
+=
op
.
description
str
+=
" op
\(
i
)
: "
let
op
=
ops
[
i
]
str
+=
op
.
description
}
}
for
varDesc
in
vars
{
for
varDesc
in
vars
{
...
...
metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
浏览文件 @
6ec031ff
...
@@ -16,245 +16,246 @@ import Accelerate
...
@@ -16,245 +16,246 @@ import Accelerate
import
Foundation
import
Foundation
protocol
Tensorial
:
CustomStringConvertible
,
CustomDebugStringConvertible
{
protocol
Tensorial
:
CustomStringConvertible
,
CustomDebugStringConvertible
{
var
dim
:
Dim
{
get
set
}
var
dim
:
Dim
{
get
set
}
func
numel
()
->
Int
func
numel
()
->
Int
var
layout
:
DataLayout
{
get
}
var
layout
:
DataLayout
{
get
}
}
}
extension
Tensorial
{
extension
Tensorial
{
func
numel
()
->
Int
{
func
numel
()
->
Int
{
return
dim
.
numel
()
return
dim
.
numel
()
}
}
}
}
class
Tensor
<
P
:
PrecisionType
>
:
Tensorial
{
class
Tensor
<
P
:
PrecisionType
>
:
Tensorial
{
enum
BufferPrecision
{
enum
BufferPrecision
{
case
Float32
,
Float16
case
Float32
,
Float16
}
var
data
:
Data
var
dim
:
Dim
var
buffer
:
MTLBuffer
!
private(set)
var
layout
:
DataLayout
class
Data
{
init
(
inSize
:
Int
,
inPointer
:
UnsafeMutablePointer
<
P
>
)
{
size
=
inSize
pointer
=
inPointer
}
}
let
size
:
Int
var
data
:
Data
var
pointer
:
UnsafeMutablePointer
<
P
>
var
dim
:
Dim
subscript
(
index
:
Int
)
->
P
{
var
buffer
:
MTLBuffer
!
get
{
private(set)
var
layout
:
DataLayout
return
pointer
[
index
]
}
class
Data
{
set
{
init
(
inSize
:
Int
,
inPointer
:
UnsafeMutablePointer
<
P
>
)
{
pointer
[
index
]
=
newValue
size
=
inSize
}
pointer
=
inPointer
}
let
size
:
Int
var
pointer
:
UnsafeMutablePointer
<
P
>
subscript
(
index
:
Int
)
->
P
{
get
{
return
pointer
[
index
]
}
set
{
pointer
[
index
]
=
newValue
}
}
func
release
()
{
pointer
.
deinitialize
(
count
:
size
)
pointer
.
deallocate
()
}
deinit
{
// release()
}
}
}
func
release
()
{
required
init
(
inDim
:
Dim
,
inLayout
:
DataLayout
=
DataLayout
.
NCHW
())
{
pointer
.
deinitialize
(
count
:
size
)
dim
=
inDim
pointer
.
deallocate
()
let
size
=
inDim
.
numel
()
*
MemoryLayout
<
P
>.
size
}
let
pointer
=
UnsafeMutablePointer
<
P
>.
allocate
(
capacity
:
size
)
deinit
{
data
=
Data
.
init
(
inSize
:
size
,
inPointer
:
pointer
)
// release()
layout
=
inLayout
}
}
required
init
(
inDim
:
Dim
,
inLayout
:
DataLayout
=
DataLayout
.
NCHW
())
{
dim
=
inDim
let
size
=
inDim
.
numel
()
*
MemoryLayout
<
P
>.
size
let
pointer
=
UnsafeMutablePointer
<
P
>.
allocate
(
capacity
:
size
)
data
=
Data
.
init
(
inSize
:
size
,
inPointer
:
pointer
)
layout
=
inLayout
}
func
convert
(
to
:
DataLayout
)
{
guard
to
!=
layout
else
{
return
}
}
func
convert
(
to
:
DataLayout
)
{
guard
dim
.
cout
()
==
4
else
{
guard
to
!=
layout
else
{
return
return
}
guard
dim
.
cout
()
==
4
else
{
return
}
guard
layout
==
DataLayout
.
NCHW
()
&&
to
==
DataLayout
.
NHWC
()
else
{
// other not support
return
}
let
newPointer
=
UnsafeMutablePointer
<
P
>.
allocate
(
capacity
:
data
.
size
)
if
layout
==
DataLayout
.
NCHW
()
{
NCHW2NHWC
(
newPtr
:
newPointer
)
}
data
.
release
()
data
.
pointer
=
newPointer
layout
=
to
}
}
func
float32ToFloat16
(
input
:
UnsafeMutablePointer
<
Float32
>
,
output
:
UnsafeMutableRawPointer
,
count
:
Int
)
{
guard
layout
==
DataLayout
.
NCHW
()
&&
to
==
DataLayout
.
NHWC
()
else
{
var
float32Buffer
=
vImage_Buffer
(
data
:
input
,
height
:
1
,
width
:
UInt
(
count
),
rowBytes
:
count
*
4
)
// other not support
var
float16buffer
=
vImage_Buffer
(
data
:
output
,
height
:
1
,
width
:
UInt
(
count
),
rowBytes
:
count
*
2
)
return
guard
vImageConvert_PlanarFtoPlanar16F
(
&
float32Buffer
,
&
float16buffer
,
0
)
==
kvImageNoError
else
{
fatalError
(
" float 32 to float 16 error ! "
)
}
}
}
let
newPointer
=
UnsafeMutablePointer
<
P
>.
allocate
(
capacity
:
data
.
size
)
func
initBuffer
(
device
:
MTLDevice
,
precision
:
BufferPrecision
=
.
Float32
)
{
if
layout
==
DataLayout
.
NCHW
()
{
guard
let
floatPointer
=
data
.
pointer
as?
UnsafeMutablePointer
<
Float32
>
else
{
NCHW2NHWC
(
newPtr
:
newPointer
)
fatalError
(
" not support yet "
)
}
let
precisionSize
:
Int
switch
precision
{
case
.
Float32
:
precisionSize
=
4
case
.
Float16
:
precisionSize
=
2
}
if
dim
.
cout
()
==
4
{
if
layout
==
DataLayout
.
NHWC
()
{
let
C
=
dim
[
3
]
let
cSlices
=
(
C
+
3
)
/
4
let
paddedC
=
cSlices
*
4
let
count
=
paddedC
*
dim
[
0
]
*
dim
[
1
]
*
dim
[
2
]
if
C
==
paddedC
{
buffer
=
device
.
makeBuffer
(
length
:
count
*
precisionSize
)
switch
precision
{
case
.
Float32
:
buffer
?
.
contents
()
.
copyMemory
(
from
:
data
.
pointer
,
byteCount
:
count
*
MemoryLayout
<
P
>.
stride
)
case
.
Float16
:
float32ToFloat16
(
input
:
floatPointer
,
output
:
buffer
.
contents
(),
count
:
count
)
}
}
else
if
C
==
1
{
buffer
=
device
.
makeBuffer
(
length
:
numel
()
*
precisionSize
)
switch
precision
{
case
.
Float32
:
buffer
?
.
contents
()
.
copyMemory
(
from
:
data
.
pointer
,
byteCount
:
numel
()
*
MemoryLayout
<
P
>.
stride
)
case
.
Float16
:
float32ToFloat16
(
input
:
floatPointer
,
output
:
buffer
.
contents
(),
count
:
numel
())
}
}
else
{
buffer
=
device
.
makeBuffer
(
length
:
count
*
precisionSize
)
let
convertedPointer
=
UnsafeMutablePointer
<
Float32
>.
allocate
(
capacity
:
count
)
var
tmpPointer
=
floatPointer
var
dstPtr
=
convertedPointer
for
_
in
0
..<
dim
[
0
]
*
dim
[
1
]
*
dim
[
2
]
{
for
j
in
0
..<
paddedC
{
if
j
<
C
{
dstPtr
[
j
]
=
tmpPointer
[
j
]
}
}
tmpPointer
+=
C
dstPtr
+=
paddedC
}
switch
precision
{
case
.
Float32
:
buffer
?
.
contents
()
.
copyMemory
(
from
:
convertedPointer
,
byteCount
:
count
*
MemoryLayout
<
P
>.
stride
)
case
.
Float16
:
float32ToFloat16
(
input
:
convertedPointer
,
output
:
buffer
.
contents
(),
count
:
count
)
}
convertedPointer
.
deinitialize
(
count
:
count
)
convertedPointer
.
deallocate
()
}
}
}
else
if
dim
.
cout
()
==
1
{
buffer
=
device
.
makeBuffer
(
length
:
numel
()
*
precisionSize
)
switch
precision
{
case
.
Float32
:
buffer
?
.
contents
()
.
copyMemory
(
from
:
data
.
pointer
,
byteCount
:
numel
()
*
MemoryLayout
<
P
>.
stride
)
case
.
Float16
:
float32ToFloat16
(
input
:
floatPointer
,
output
:
buffer
.
contents
(),
count
:
numel
())
}
}
else
{
fatalError
(
" not support !"
)
}
//TODO: release
data
.
release
()
}
}
var
width
:
Int
{
data
.
release
()
get
{
data
.
pointer
=
newPointer
if
dim
.
cout
()
==
4
{
layout
=
to
return
dim
[
1
]
}
}
else
{
fatalError
()
func
float32ToFloat16
(
input
:
UnsafeMutablePointer
<
Float32
>
,
output
:
UnsafeMutableRawPointer
,
count
:
Int
)
{
}
var
float32Buffer
=
vImage_Buffer
(
data
:
input
,
height
:
1
,
width
:
UInt
(
count
),
rowBytes
:
count
*
4
)
}
var
float16buffer
=
vImage_Buffer
(
data
:
output
,
height
:
1
,
width
:
UInt
(
count
),
rowBytes
:
count
*
2
)
guard
vImageConvert_PlanarFtoPlanar16F
(
&
float32Buffer
,
&
float16buffer
,
0
)
==
kvImageNoError
else
{
fatalError
(
" float 32 to float 16 error ! "
)
}
}
func
initBuffer
(
device
:
MTLDevice
,
precision
:
BufferPrecision
=
.
Float32
)
{
guard
let
floatPointer
=
data
.
pointer
as?
UnsafeMutablePointer
<
Float32
>
else
{
fatalError
(
" not support yet "
)
}
}
var
height
:
Int
{
let
precisionSize
:
Int
get
{
switch
precision
{
if
dim
.
cout
()
==
4
{
case
.
Float32
:
return
dim
[
2
]
precisionSize
=
4
}
else
{
case
.
Float16
:
fatalError
()
precisionSize
=
2
}
}
}
}
var
channel
:
Int
{
if
dim
.
cout
()
==
4
{
get
{
if
layout
==
DataLayout
.
NHWC
()
{
if
dim
.
cout
()
==
4
{
let
C
=
dim
[
3
]
return
dim
[
3
]
let
cSlices
=
(
C
+
3
)
/
4
}
else
{
let
paddedC
=
cSlices
*
4
fatalError
()
let
count
=
paddedC
*
dim
[
0
]
*
dim
[
1
]
*
dim
[
2
]
if
C
==
paddedC
{
buffer
=
device
.
makeBuffer
(
length
:
count
*
precisionSize
)
switch
precision
{
case
.
Float32
:
buffer
?
.
contents
()
.
copyMemory
(
from
:
data
.
pointer
,
byteCount
:
count
*
MemoryLayout
<
P
>.
stride
)
case
.
Float16
:
float32ToFloat16
(
input
:
floatPointer
,
output
:
buffer
.
contents
(),
count
:
count
)
}
}
else
if
C
==
1
{
buffer
=
device
.
makeBuffer
(
length
:
numel
()
*
precisionSize
)
switch
precision
{
case
.
Float32
:
buffer
?
.
contents
()
.
copyMemory
(
from
:
data
.
pointer
,
byteCount
:
numel
()
*
MemoryLayout
<
P
>.
stride
)
case
.
Float16
:
float32ToFloat16
(
input
:
floatPointer
,
output
:
buffer
.
contents
(),
count
:
numel
())
}
}
else
{
buffer
=
device
.
makeBuffer
(
length
:
count
*
precisionSize
)
let
convertedPointer
=
UnsafeMutablePointer
<
Float32
>.
allocate
(
capacity
:
count
)
var
tmpPointer
=
floatPointer
var
dstPtr
=
convertedPointer
for
_
in
0
..<
dim
[
0
]
*
dim
[
1
]
*
dim
[
2
]
{
for
j
in
0
..<
paddedC
{
if
j
<
C
{
dstPtr
[
j
]
=
tmpPointer
[
j
]
}
}
}
tmpPointer
+=
C
dstPtr
+=
paddedC
}
switch
precision
{
case
.
Float32
:
buffer
?
.
contents
()
.
copyMemory
(
from
:
convertedPointer
,
byteCount
:
count
*
MemoryLayout
<
P
>.
stride
)
case
.
Float16
:
float32ToFloat16
(
input
:
convertedPointer
,
output
:
buffer
.
contents
(),
count
:
count
)
}
convertedPointer
.
deinitialize
(
count
:
count
)
convertedPointer
.
deallocate
()
}
}
}
}
else
if
dim
.
cout
()
==
1
{
let
num
=
((
numel
()
+
3
)
/
4
)
*
4
buffer
=
device
.
makeBuffer
(
length
:
num
*
precisionSize
)
switch
precision
{
case
.
Float32
:
buffer
?
.
contents
()
.
copyMemory
(
from
:
data
.
pointer
,
byteCount
:
num
*
MemoryLayout
<
P
>.
stride
)
case
.
Float16
:
float32ToFloat16
(
input
:
floatPointer
,
output
:
buffer
.
contents
(),
count
:
num
)
}
}
else
{
fatalError
(
" not support !"
)
}
}
//TODO: release
data
.
release
()
}
var
width
:
Int
{
get
{
if
dim
.
cout
()
==
4
{
return
dim
[
1
]
}
else
{
fatalError
()
}
}
}
var
height
:
Int
{
get
{
if
dim
.
cout
()
==
4
{
return
dim
[
2
]
}
else
{
fatalError
()
}
}
}
var
channel
:
Int
{
get
{
if
dim
.
cout
()
==
4
{
return
dim
[
3
]
}
else
{
fatalError
()
}
}
}
func
NCHW2NHWC
(
newPtr
:
UnsafeMutablePointer
<
P
>
)
{
let
N
=
dim
[
0
]
let
C
=
dim
[
1
]
let
H
=
dim
[
2
]
let
W
=
dim
[
3
]
let
HXW
=
H
*
W
let
CXHXW
=
C
*
H
*
W
func
NCHW2NHWC
(
newPtr
:
UnsafeMutablePointer
<
P
>
)
{
var
index
:
Int
=
0
let
N
=
dim
[
0
]
for
n
in
0
..<
N
{
let
C
=
dim
[
1
]
for
h
in
0
..<
H
{
let
H
=
dim
[
2
]
for
w
in
0
..<
W
{
let
W
=
dim
[
3
]
for
c
in
0
..<
C
{
let
HXW
=
H
*
W
newPtr
[
index
]
=
data
.
pointer
[
n
*
CXHXW
+
c
*
HXW
+
h
*
W
+
w
]
let
CXHXW
=
C
*
H
*
W
index
+=
1
}
var
index
:
Int
=
0
for
n
in
0
..<
N
{
for
h
in
0
..<
H
{
for
w
in
0
..<
W
{
for
c
in
0
..<
C
{
newPtr
[
index
]
=
data
.
pointer
[
n
*
CXHXW
+
c
*
HXW
+
h
*
W
+
w
]
index
+=
1
}
}
}
}
}
dim
.
swapeDimAt
(
index1
:
1
,
index2
:
3
)
}
}
}
dim
.
swapeDimAt
(
index1
:
1
,
index2
:
3
)
}
}
}
extension
Tensor
{
extension
Tensor
{
var
debugDescription
:
String
{
var
debugDescription
:
String
{
var
str
=
"dim:
\(
dim
)
\n
"
var
str
=
"dim:
\(
dim
)
\n
"
str
+=
"MTLBuffer:
\(
self
.
buffer
)
\n
"
str
+=
"MTLBuffer:
\(
self
.
buffer
)
\n
"
for
i
in
0
..<
buffer
.
length
/
MemoryLayout
<
P
>.
size
{
for
i
in
0
..<
buffer
.
length
/
MemoryLayout
<
P
>.
size
{
str
+=
"
\(
buffer
.
contents
()
.
assumingMemoryBound
(
to
:
P
.
self
)[
i
]
)
"
str
+=
"
\(
buffer
.
contents
()
.
assumingMemoryBound
(
to
:
P
.
self
)[
i
]
)
"
}
return
str
}
}
return
str
func
logDataPointer
(
header
:
String
=
""
)
{
}
print
(
header
)
var
str
=
""
func
logDataPointer
(
header
:
String
=
""
)
{
str
+=
"data size:
\(
data
.
size
)
\n
"
print
(
header
)
str
+=
"dim:
\(
dim
)
\n
"
var
str
=
"
"
for
i
in
0
..<
numel
()
{
str
+=
"data size:
\(
data
.
size
)
\n
"
str
+=
"
\(
data
.
pointer
[
i
]
)
"
str
+=
"dim:
\(
dim
)
\n
"
}
for
i
in
0
..<
numel
()
{
print
(
str
)
str
+=
"
\(
data
.
pointer
[
i
]
)
"
}
}
print
(
str
)
var
description
:
String
{
}
return
debugDescription
}
var
description
:
String
{
return
debugDescription
}
}
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录