未验证 提交 ff4fb9b2 编写于 作者: N NazgulLee 提交者: GitHub

1. optimize add by channel; 2. texture reuse should consider param input and...

1. optimize add by channel; 2. texture reuse should consider param inputs and multiple fetches; 3. the heap should also be reallocated when the max size becomes smaller; 4. paddle mobile test disables the request cache (#1769)

* optimize add by channel

* 1. texture reuse should consider param inputs and multiple fetches; 2. the heap should also be reallocated when the max size becomes smaller; 3. paddle mobile test disables the request cache
上级 48442e3f
...@@ -63,7 +63,7 @@ let device = MTLCreateSystemDefaultDevice()! ...@@ -63,7 +63,7 @@ let device = MTLCreateSystemDefaultDevice()!
let commandQueue = device.makeCommandQueue()! let commandQueue = device.makeCommandQueue()!
var timeCosts = [Double]() var timeCosts = [Double]()
var count = 0 var count = 0
var totalCount = 10 var totalCount = 100
var orderedVars: [String] = [] var orderedVars: [String] = []
var varIndex = 0 var varIndex = 0
...@@ -107,7 +107,9 @@ class TestViewController: UIViewController { ...@@ -107,7 +107,9 @@ class TestViewController: UIViewController {
} }
private func getTestInfo() { private func getTestInfo() {
Alamofire.request("\(hostUrlStr)/getTestInfo").validate().responseJSON { (response) in var testInfoRequest = URLRequest(url: URL(string: "\(hostUrlStr)/getTestInfo")!)
testInfoRequest.cachePolicy = .reloadIgnoringLocalAndRemoteCacheData
Alamofire.request(testInfoRequest).validate().responseJSON { (response) in
guard response.result.isSuccess else { guard response.result.isSuccess else {
self.testLog("getTestInfo request error") self.testLog("getTestInfo request error")
return return
...@@ -194,7 +196,9 @@ class TestViewController: UIViewController { ...@@ -194,7 +196,9 @@ class TestViewController: UIViewController {
testResult.model = model testResult.model = model
let modelUrlStr = "\(hostUrlStr)/getFile/\(model.name)" let modelUrlStr = "\(hostUrlStr)/getFile/\(model.name)"
Alamofire.request("\(modelUrlStr)/model").validate().responseData { (response) in var modelRequest = URLRequest(url: URL(string: "\(modelUrlStr)/model")!)
modelRequest.cachePolicy = .reloadIgnoringLocalAndRemoteCacheData
Alamofire.request(modelRequest).validate().responseData { (response) in
guard response.result.isSuccess, let modelData = response.result.value else { guard response.result.isSuccess, let modelData = response.result.value else {
let msg = "get model \(model.name) error" let msg = "get model \(model.name) error"
self.testLog(msg) self.testLog(msg)
...@@ -205,7 +209,9 @@ class TestViewController: UIViewController { ...@@ -205,7 +209,9 @@ class TestViewController: UIViewController {
//let modelData2 = try! Data(contentsOf: URL(fileURLWithPath: Bundle.main.path(forResource: "yolo_model_v3_16", ofType: nil)!)) //let modelData2 = try! Data(contentsOf: URL(fileURLWithPath: Bundle.main.path(forResource: "yolo_model_v3_16", ofType: nil)!))
let modelPtr = UnsafeMutablePointer<UInt8>.allocate(capacity: modelData.count) let modelPtr = UnsafeMutablePointer<UInt8>.allocate(capacity: modelData.count)
NSData(data: modelData).getBytes(modelPtr, length: modelData.count) NSData(data: modelData).getBytes(modelPtr, length: modelData.count)
Alamofire.request("\(modelUrlStr)/params/\(model.paramsPrecision)").validate().responseData(completionHandler: { (response) in var paramsRequest = URLRequest(url: URL(string: "\(modelUrlStr)/params/\(model.paramsPrecision)")!)
paramsRequest.cachePolicy = .reloadIgnoringLocalAndRemoteCacheData
Alamofire.request(paramsRequest).validate().responseData(completionHandler: { (response) in
guard response.result.isSuccess, let paramsData = response.result.value else { guard response.result.isSuccess, let paramsData = response.result.value else {
let msg = "get params \(model.name) error" let msg = "get params \(model.name) error"
self.testLog(msg) self.testLog(msg)
...@@ -244,7 +250,10 @@ class TestViewController: UIViewController { ...@@ -244,7 +250,10 @@ class TestViewController: UIViewController {
} }
let fetchVar = fetchVars[0] let fetchVar = fetchVars[0]
net.inputDim = Dim(inDim: [dims[0], dims[2], dims[3], dims[1]]) net.inputDim = Dim(inDim: [dims[0], dims[2], dims[3], dims[1]])
Alamofire.request("\(modelUrlStr)/data/\(feedVar.name.replacingOccurrences(of: "/", with: "_"))").validate().responseData(completionHandler: { (response) in
var feedVarRequest = URLRequest(url: URL(string: "\(modelUrlStr)/data/\(feedVar.name.replacingOccurrences(of: "/", with: "_"))")!)
feedVarRequest.cachePolicy = .reloadIgnoringLocalAndRemoteCacheData
Alamofire.request(feedVarRequest).validate().responseData(completionHandler: { (response) in
guard response.result.isSuccess, let inputData = response.result.value else { guard response.result.isSuccess, let inputData = response.result.value else {
let msg = "get var \(feedVar) error" let msg = "get var \(feedVar) error"
self.testLog(msg) self.testLog(msg)
...@@ -309,7 +318,8 @@ class TestViewController: UIViewController { ...@@ -309,7 +318,8 @@ class TestViewController: UIViewController {
for i in 0..<outputSize { for i in 0..<outputSize {
let a = output[i] let a = output[i]
let b = resultHolder.result[i] let b = resultHolder.result[i]
if abs(a - b) > precision && abs(a - b) / min(abs(a), abs(b)) > 0.05 { // && abs(a - b) / min(abs(a), abs(b)) > 0.05
if abs(a - b) > precision {
isResultEqual = false isResultEqual = false
msg = "unequal: i: \(i) target: \(output[i]) result: \(resultHolder.result[i])" msg = "unequal: i: \(i) target: \(output[i]) result: \(resultHolder.result[i])"
self.testLog(msg) self.testLog(msg)
...@@ -403,7 +413,9 @@ class TestViewController: UIViewController { ...@@ -403,7 +413,9 @@ class TestViewController: UIViewController {
} }
} }
if severVars.contains(varName) { if severVars.contains(varName) {
Alamofire.request("\(urlString)/\(varName)").validate().responseData { (response) in var severVarRequest = URLRequest(url: URL(string: "\(urlString)/\(varName)")!)
severVarRequest.cachePolicy = .reloadIgnoringLocalAndRemoteCacheData
Alamofire.request(severVarRequest).validate().responseData { (response) in
varIndex += 1 varIndex += 1
guard response.result.isSuccess, let varData = response.result.value else { guard response.result.isSuccess, let varData = response.result.value else {
self.compareVars(runner: runner, model: model, completion: completion) self.compareVars(runner: runner, model: model, completion: completion)
......
...@@ -109,6 +109,7 @@ inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) { ...@@ -109,6 +109,7 @@ inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) {
struct ElementwiseAddParam { struct ElementwiseAddParam {
int32_t fast; int32_t fast;
int32_t addByChannel;
int32_t axis; int32_t axis;
int32_t ylen; int32_t ylen;
int32_t xdim[4]; int32_t xdim[4];
......
...@@ -19,8 +19,10 @@ using namespace metal; ...@@ -19,8 +19,10 @@ using namespace metal;
half4 getBiasHalf(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_array<half, access::sample> biasTexture) { half4 getBiasHalf(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_array<half, access::sample> biasTexture) {
half4 output; half4 output;
if (addParam.fast) { if (addParam.fast == 1) {
output = biasTexture.read(gid.xy, gid.z); output = biasTexture.read(gid.xy, gid.z);
} else if (addParam.addByChannel == 1) {
output = biasTexture.read(uint2(0, 0), gid.z);
} else { } else {
int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
...@@ -44,8 +46,10 @@ half4 getBiasHalf(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_a ...@@ -44,8 +46,10 @@ half4 getBiasHalf(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_a
float4 getBias(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_array<float, access::sample> biasTexture) { float4 getBias(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_array<float, access::sample> biasTexture) {
float4 output; float4 output;
if (addParam.fast) { if (addParam.fast == 1) {
output = float4(biasTexture.read(gid.xy, gid.z)); output = float4(biasTexture.read(gid.xy, gid.z));
} else if (addParam.addByChannel == 1) {
output = float4(biasTexture.read(uint2(0, 0), gid.z));
} else { } else {
int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
......
...@@ -26,12 +26,12 @@ kernel void elementwise_add(texture2d_array<float, access::read> inputX [[textur ...@@ -26,12 +26,12 @@ kernel void elementwise_add(texture2d_array<float, access::read> inputX [[textur
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return; gid.z >= outTexture.get_array_size()) return;
float4 rx, ry; float4 rx, ry;
rx = inputX.read(gid.xy, gid.z);
if (pm.fast == 1) { if (pm.fast == 1) {
rx = inputX.read(gid.xy, gid.z);
ry = inputY.read(gid.xy, gid.z); ry = inputY.read(gid.xy, gid.z);
} else if (pm.addByChannel == 1) {
ry = inputY.read(uint2(0, 0), gid.z);
} else { } else {
rx = inputX.read(gid.xy, gid.z);
int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
...@@ -62,12 +62,12 @@ kernel void elementwise_add_half(texture2d_array<half, access::read> inputX [[te ...@@ -62,12 +62,12 @@ kernel void elementwise_add_half(texture2d_array<half, access::read> inputX [[te
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return; gid.z >= outTexture.get_array_size()) return;
half4 rx, ry; half4 rx, ry;
rx = inputX.read(gid.xy, gid.z);
if (pm.fast == 1) { if (pm.fast == 1) {
rx = inputX.read(gid.xy, gid.z);
ry = inputY.read(gid.xy, gid.z); ry = inputY.read(gid.xy, gid.z);
} else if (pm.addByChannel == 1) {
ry = inputY.read(uint2(0, 0), gid.z);
} else { } else {
rx = inputX.read(gid.xy, gid.z);
int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
......
...@@ -37,12 +37,12 @@ kernel void FUNC3_(elementwise_add, PRELU_TYPE, P)(texture2d_array<P, access::re ...@@ -37,12 +37,12 @@ kernel void FUNC3_(elementwise_add, PRELU_TYPE, P)(texture2d_array<P, access::re
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return; gid.z >= outTexture.get_array_size()) return;
VECTOR(P, 4) rx, ry; VECTOR(P, 4) rx, ry;
rx = inputX.read(gid.xy, gid.z);
if (pm.fast == 1) { if (pm.fast == 1) {
rx = inputX.read(gid.xy, gid.z);
ry = inputY.read(gid.xy, gid.z); ry = inputY.read(gid.xy, gid.z);
} else if (pm.addByChannel == 1) {
ry = inputY.read(uint2(0, 0), gid.z);
} else { } else {
rx = inputX.read(gid.xy, gid.z);
int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
......
...@@ -160,7 +160,7 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -160,7 +160,7 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
try setupWithMPS(device: device, param: param) try setupWithMPS(device: device, param: param)
} else { } else {
if functionName == nil { if functionName == nil {
fatalError(" unsupport yet ") throw PaddleMobileError.makeError(type: .netError, msg: "function name nil")
} }
try super.init(device: device, inFunctionName: functionName, initContext: initContext) try super.init(device: device, inFunctionName: functionName, initContext: initContext)
try setupWithoutMPS(device: device, param: param) try setupWithoutMPS(device: device, param: param)
...@@ -371,7 +371,7 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -371,7 +371,7 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
} }
private class func canMPSAddByElement(param: ConvAddReluParam<P>) -> Bool { private class func canMPSAddByElement(param: ConvAddReluParam<P>) -> Bool {
if let y = param.y, y.dim.dims == param.input.dim.dims { if let y = param.y, y.dim.dims == param.output.dim.dims {
return true return true
} }
return false return false
......
...@@ -46,7 +46,7 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -46,7 +46,7 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
try setupWithMPS(device: device, param: param) try setupWithMPS(device: device, param: param)
} else { } else {
if functionName == nil { if functionName == nil {
fatalError(" unsupport yet ") throw PaddleMobileError.makeError(type: .netError, msg: "function name nil")
} }
try super.init(device: device, inFunctionName: functionName, initContext: initContext) try super.init(device: device, inFunctionName: functionName, initContext: initContext)
try setupWithoutMPS(device: device, param: param) try setupWithoutMPS(device: device, param: param)
......
...@@ -16,6 +16,7 @@ import Foundation ...@@ -16,6 +16,7 @@ import Foundation
struct ElementwiseAddMetalParam { struct ElementwiseAddMetalParam {
var fast: Int32 = 0 var fast: Int32 = 0
var addByChannel: Int32 = 0
var axis: Int32 = 0 var axis: Int32 = 0
var ylen: Int32 = 0 var ylen: Int32 = 0
var xdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) var xdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
...@@ -91,6 +92,9 @@ class ElementwiseAddKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -91,6 +92,9 @@ class ElementwiseAddKernel<P: PrecisionProtocol>: Kernel, Computable {
// print("===> elementwise_add fast!!!") // print("===> elementwise_add fast!!!")
metalParam.fast = 1 metalParam.fast = 1
} }
if inputY.tensorDim.cout() == 1 && (axis == 1 || (axis == -1 && inputY.tensorDim.dims[0] == inputX.padToFourDim[1])) {
metalParam.addByChannel = 1
}
return metalParam return metalParam
} }
} }
...@@ -23,27 +23,7 @@ class ElementwiseAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -23,27 +23,7 @@ class ElementwiseAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
try param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) try param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
metalParam = ElementwiseAddMetalParam.init() metalParam = ElementwiseAddKernel<P>.metalParamFrom(inputX: param.inputX, inputY: param.inputY, axis: param.axis)
let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) }
let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) }
let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) }
let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) }
metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
if param.axis == -1 {
metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout())
} else {
metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
}
metalParam.ylen = Int32(param.inputY.tensorDim.cout())
if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
// print("===> elementwise_add fast!!!")
metalParam.fast = 1
}
if GlobalConfig.shared.computePrecision == .Float32 { if GlobalConfig.shared.computePrecision == .Float32 {
if param.mode == "channel" { if param.mode == "channel" {
......
...@@ -66,6 +66,7 @@ class MemoryOptimize: MemoryManager { ...@@ -66,6 +66,7 @@ class MemoryOptimize: MemoryManager {
var createdNodes = [String: Node]() var createdNodes = [String: Node]()
var nodesArray = [Node]() var nodesArray = [Node]()
let scope = program.scope let scope = program.scope
var fetchVarNames: [String] = []
func appendNodes(textureDic: [String: [String]], varsDic: [String: PMVarDesc]) { func appendNodes(textureDic: [String: [String]], varsDic: [String: PMVarDesc]) {
for dicPair in textureDic { for dicPair in textureDic {
for varName in dicPair.value { for varName in dicPair.value {
...@@ -94,9 +95,16 @@ class MemoryOptimize: MemoryManager { ...@@ -94,9 +95,16 @@ class MemoryOptimize: MemoryManager {
varsDic[varDesc.name] = varDesc varsDic[varDesc.name] = varDesc
} }
for op in block.ops { for op in block.ops {
if op.type == gFetchType {
for names in op.inputs.values {
fetchVarNames.append(contentsOf: names)
}
}
appendNodes(textureDic: op.inputs, varsDic: varsDic) appendNodes(textureDic: op.inputs, varsDic: varsDic)
appendNodes(textureDic: op.paraInputs, varsDic: varsDic)
appendNodes(textureDic: op.outputs, varsDic: varsDic) appendNodes(textureDic: op.outputs, varsDic: varsDic)
appendNodes(textureDic: op.inputs, varsDic: varsDic) appendNodes(textureDic: op.inputs, varsDic: varsDic)
appendNodes(textureDic: op.paraInputs, varsDic: varsDic)
} }
} }
var nodeGroups: [[Node]] = [] var nodeGroups: [[Node]] = []
...@@ -106,7 +114,8 @@ class MemoryOptimize: MemoryManager { ...@@ -106,7 +114,8 @@ class MemoryOptimize: MemoryManager {
node.visited = true node.visited = true
var placed = false var placed = false
for i in 0..<nodeGroups.count { for i in 0..<nodeGroups.count {
if nodeGroups[i].last?.count == 0 { let lastNode = nodeGroups[i].last
if lastNode?.count == 0 && !fetchVarNames.contains(lastNode?.name ?? "") {
nodeGroups[i].append(node) nodeGroups[i].append(node)
placed = true placed = true
break break
...@@ -156,7 +165,7 @@ class MemoryBucket { ...@@ -156,7 +165,7 @@ class MemoryBucket {
public func allocHeap() { public func allocHeap() {
let size = maxSizeForTextures(textures) let size = maxSizeForTextures(textures)
if size > (heap?.size ?? 0) { if size != (heap?.size ?? 0) {
heap?.setPurgeableState(.empty) heap?.setPurgeableState(.empty)
heap = makeHeapForSize(size) heap = makeHeapForSize(size)
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册