Unverified commit ff4fb9b2, authored by NazgulLee, committed by GitHub

1. optimize add by channel; 2. texture reuse should consider param input and multi fetch; 3. should also realloc heap when max size becomes smaller; 4. disable cache in the paddle mobile test (#1769)

* optimize add by channel

* 1. texture reuse should consider param input and multi fetch; 2. should also realloc heap when max size becomes smaller; 3. disable cache in the paddle mobile test
Parent 48442e3f
@@ -63,7 +63,7 @@ let device = MTLCreateSystemDefaultDevice()!
let commandQueue = device.makeCommandQueue()!
var timeCosts = [Double]()
var count = 0
var totalCount = 10
var totalCount = 100
var orderedVars: [String] = []
var varIndex = 0
@@ -107,7 +107,9 @@ class TestViewController: UIViewController {
}
private func getTestInfo() {
Alamofire.request("\(hostUrlStr)/getTestInfo").validate().responseJSON { (response) in
var testInfoRequest = URLRequest(url: URL(string: "\(hostUrlStr)/getTestInfo")!)
testInfoRequest.cachePolicy = .reloadIgnoringLocalAndRemoteCacheData
Alamofire.request(testInfoRequest).validate().responseJSON { (response) in
guard response.result.isSuccess else {
self.testLog("getTestInfo request error")
return
@@ -194,7 +196,9 @@ class TestViewController: UIViewController {
testResult.model = model
let modelUrlStr = "\(hostUrlStr)/getFile/\(model.name)"
Alamofire.request("\(modelUrlStr)/model").validate().responseData { (response) in
var modelRequest = URLRequest(url: URL(string: "\(modelUrlStr)/model")!)
modelRequest.cachePolicy = .reloadIgnoringLocalAndRemoteCacheData
Alamofire.request(modelRequest).validate().responseData { (response) in
guard response.result.isSuccess, let modelData = response.result.value else {
let msg = "get model \(model.name) error"
self.testLog(msg)
@@ -205,7 +209,9 @@ class TestViewController: UIViewController {
//let modelData2 = try! Data(contentsOf: URL(fileURLWithPath: Bundle.main.path(forResource: "yolo_model_v3_16", ofType: nil)!))
let modelPtr = UnsafeMutablePointer<UInt8>.allocate(capacity: modelData.count)
NSData(data: modelData).getBytes(modelPtr, length: modelData.count)
Alamofire.request("\(modelUrlStr)/params/\(model.paramsPrecision)").validate().responseData(completionHandler: { (response) in
var paramsRequest = URLRequest(url: URL(string: "\(modelUrlStr)/params/\(model.paramsPrecision)")!)
paramsRequest.cachePolicy = .reloadIgnoringLocalAndRemoteCacheData
Alamofire.request(paramsRequest).validate().responseData(completionHandler: { (response) in
guard response.result.isSuccess, let paramsData = response.result.value else {
let msg = "get params \(model.name) error"
self.testLog(msg)
@@ -244,7 +250,10 @@ class TestViewController: UIViewController {
}
let fetchVar = fetchVars[0]
net.inputDim = Dim(inDim: [dims[0], dims[2], dims[3], dims[1]])
Alamofire.request("\(modelUrlStr)/data/\(feedVar.name.replacingOccurrences(of: "/", with: "_"))").validate().responseData(completionHandler: { (response) in
var feedVarRequest = URLRequest(url: URL(string: "\(modelUrlStr)/data/\(feedVar.name.replacingOccurrences(of: "/", with: "_"))")!)
feedVarRequest.cachePolicy = .reloadIgnoringLocalAndRemoteCacheData
Alamofire.request(feedVarRequest).validate().responseData(completionHandler: { (response) in
guard response.result.isSuccess, let inputData = response.result.value else {
let msg = "get var \(feedVar) error"
self.testLog(msg)
@@ -309,7 +318,8 @@ class TestViewController: UIViewController {
for i in 0..<outputSize {
let a = output[i]
let b = resultHolder.result[i]
if abs(a - b) > precision && abs(a - b) / min(abs(a), abs(b)) > 0.05 {
// && abs(a - b) / min(abs(a), abs(b)) > 0.05
if abs(a - b) > precision {
isResultEqual = false
msg = "unequal: i: \(i) target: \(output[i]) result: \(resultHolder.result[i])"
self.testLog(msg)
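The comparison above used to require both an absolute error above `precision` and a relative error above 5%; the relative clause is now commented out, so any element whose absolute error exceeds `precision` fails, making the check strictly tighter. A minimal sketch of the two predicates (names are illustrative, not from the diff):

```swift
// Sketch of the two tolerance predicates. The commit keeps only the
// absolute check; the relative one is left commented out in the source.
func failsAbsolute(_ a: Float, _ b: Float, precision: Float) -> Bool {
    return abs(a - b) > precision
}

func failsRelative(_ a: Float, _ b: Float, ratio: Float = 0.05) -> Bool {
    // Relative error against the smaller magnitude; this blows up near
    // zero, which is why it was only ever used together with the
    // absolute check.
    return abs(a - b) / min(abs(a), abs(b)) > ratio
}
```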
@@ -403,7 +413,9 @@ class TestViewController: UIViewController {
}
}
if severVars.contains(varName) {
Alamofire.request("\(urlString)/\(varName)").validate().responseData { (response) in
var severVarRequest = URLRequest(url: URL(string: "\(urlString)/\(varName)")!)
severVarRequest.cachePolicy = .reloadIgnoringLocalAndRemoteCacheData
Alamofire.request(severVarRequest).validate().responseData { (response) in
varIndex += 1
guard response.result.isSuccess, let varData = response.result.value else {
self.compareVars(runner: runner, model: model, completion: completion)
......
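Every request the test client makes is switched from a plain URL string to a `URLRequest` with `cachePolicy = .reloadIgnoringLocalAndRemoteCacheData`, so repeated runs always download the current model, params, and expected outputs instead of a stale `URLCache` entry. The repeated pattern could be captured in a helper like the following sketch (the helper itself is hypothetical, not part of the diff):

```swift
import Alamofire

// Hypothetical convenience wrapper for the pattern used throughout the
// test client: build a request that bypasses local and remote caches.
func uncachedRequest(_ urlString: String) -> URLRequest? {
    guard let url = URL(string: urlString) else { return nil }
    var request = URLRequest(url: url)
    request.cachePolicy = .reloadIgnoringLocalAndRemoteCacheData
    return request
}

// Usage mirroring the diff:
// if let request = uncachedRequest("\(hostUrlStr)/getTestInfo") {
//     Alamofire.request(request).validate().responseJSON { response in /* ... */ }
// }
```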
@@ -109,6 +109,7 @@ inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) {
struct ElementwiseAddParam {
int32_t fast;
int32_t addByChannel;
int32_t axis;
int32_t ylen;
int32_t xdim[4];
......
@@ -19,8 +19,10 @@ using namespace metal;
half4 getBiasHalf(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_array<half, access::sample> biasTexture) {
half4 output;
if (addParam.fast) {
if (addParam.fast == 1) {
output = biasTexture.read(gid.xy, gid.z);
} else if (addParam.addByChannel == 1) {
output = biasTexture.read(uint2(0, 0), gid.z);
} else {
int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
@@ -44,8 +46,10 @@ half4 getBiasHalf(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_a
float4 getBias(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_array<float, access::sample> biasTexture) {
float4 output;
if (addParam.fast) {
if (addParam.fast == 1) {
output = float4(biasTexture.read(gid.xy, gid.z));
} else if (addParam.addByChannel == 1) {
output = float4(biasTexture.read(uint2(0, 0), gid.z));
} else {
int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
......
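The new `addByChannel` branch reads the bias from texel (0, 0) of array slice `gid.z`. This relies on how a 1-D tensor of length C appears to be packed on the GPU: ⌈C/4⌉ array slices of a 1×1 texture, four channels per RGBA texel, matching the slice layout of the 4-D output, so the four channel biases needed at output slice `gid.z` all sit at (0, 0) of bias slice `gid.z`. A sketch of that assumed channel-to-texel mapping:

```swift
// Assumed RGBA packing of a [C]-shaped tensor into a 1x1 texture array:
// channel c lives in array slice c / 4, vector lane c % 4, texel (0, 0).
func biasLocation(forChannel c: Int) -> (slice: Int, lane: Int) {
    return (slice: c / 4, lane: c % 4)
}
```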
@@ -26,12 +26,12 @@ kernel void elementwise_add(texture2d_array<float, access::read> inputX [[textur
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
float4 rx, ry;
rx = inputX.read(gid.xy, gid.z);
if (pm.fast == 1) {
rx = inputX.read(gid.xy, gid.z);
ry = inputY.read(gid.xy, gid.z);
} else if (pm.addByChannel == 1) {
ry = inputY.read(uint2(0, 0), gid.z);
} else {
rx = inputX.read(gid.xy, gid.z);
int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
@@ -62,12 +62,12 @@ kernel void elementwise_add_half(texture2d_array<half, access::read> inputX [[te
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
half4 rx, ry;
rx = inputX.read(gid.xy, gid.z);
if (pm.fast == 1) {
rx = inputX.read(gid.xy, gid.z);
ry = inputY.read(gid.xy, gid.z);
} else if (pm.addByChannel == 1) {
ry = inputY.read(uint2(0, 0), gid.z);
} else {
rx = inputX.read(gid.xy, gid.z);
int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
......
@@ -37,12 +37,12 @@ kernel void FUNC3_(elementwise_add, PRELU_TYPE, P)(texture2d_array<P, access::re
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
VECTOR(P, 4) rx, ry;
rx = inputX.read(gid.xy, gid.z);
if (pm.fast == 1) {
rx = inputX.read(gid.xy, gid.z);
ry = inputY.read(gid.xy, gid.z);
} else if (pm.addByChannel == 1) {
ry = inputY.read(uint2(0, 0), gid.z);
} else {
rx = inputX.read(gid.xy, gid.z);
int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
......
@@ -160,7 +160,7 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
try setupWithMPS(device: device, param: param)
} else {
if functionName == nil {
fatalError(" unsupport yet ")
throw PaddleMobileError.makeError(type: .netError, msg: "function name nil")
}
try super.init(device: device, inFunctionName: functionName, initContext: initContext)
try setupWithoutMPS(device: device, param: param)
@@ -371,7 +371,7 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
}
private class func canMPSAddByElement(param: ConvAddReluParam<P>) -> Bool {
if let y = param.y, y.dim.dims == param.input.dim.dims {
if let y = param.y, y.dim.dims == param.output.dim.dims {
return true
}
return false
......
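Two changes land in the conv kernels: a missing Metal function name now throws a `PaddleMobileError` instead of hitting `fatalError`, so a host app can fail gracefully; and `canMPSAddByElement` compares `y` against the output dims rather than the input dims, since the bias is added to the convolution result and must match its shape. A hedged call-site sketch (the initializer signature and precision type are assumed from the surrounding code, not shown in the diff):

```swift
// Hypothetical call site: with the throwing path, an unsupported kernel
// surfaces as a catchable error rather than terminating the process.
do {
    let kernel = try ConvAddReluKernel<Float32>(device: device,
                                                param: param,
                                                initContext: initContext)
    // ... encode and run ...
} catch {
    print("conv_add_relu kernel unavailable: \(error)")
}
```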
@@ -46,7 +46,7 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
try setupWithMPS(device: device, param: param)
} else {
if functionName == nil {
fatalError(" unsupport yet ")
throw PaddleMobileError.makeError(type: .netError, msg: "function name nil")
}
try super.init(device: device, inFunctionName: functionName, initContext: initContext)
try setupWithoutMPS(device: device, param: param)
......
@@ -16,6 +16,7 @@ import Foundation
struct ElementwiseAddMetalParam {
var fast: Int32 = 0
var addByChannel: Int32 = 0
var axis: Int32 = 0
var ylen: Int32 = 0
var xdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
@@ -91,6 +92,9 @@ class ElementwiseAddKernel<P: PrecisionProtocol>: Kernel, Computable {
// print("===> elementwise_add fast!!!")
metalParam.fast = 1
}
if inputY.tensorDim.cout() == 1 && (axis == 1 || (axis == -1 && inputY.tensorDim.dims[0] == inputX.padToFourDim[1])) {
metalParam.addByChannel = 1
}
return metalParam
}
}
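`addByChannel` is set when Y is a 1-D tensor laid along X's channel axis: either `axis == 1` explicitly, or `axis == -1` with Y's length equal to X's channel count (`padToFourDim[1]`, i.e. C in NCHW). On the GPU this skips the general index-transposition path and reduces the bias fetch to one read per slice. The eligibility test, extracted as a standalone sketch with illustrative parameter names:

```swift
// Sketch of the add-by-channel eligibility check set up above. yRank
// mirrors inputY.tensorDim.cout(), yLength mirrors tensorDim.dims[0],
// and xChannels mirrors padToFourDim[1] (the NCHW channel count).
func canAddByChannel(yRank: Int, yLength: Int, xChannels: Int, axis: Int) -> Bool {
    return yRank == 1 && (axis == 1 || (axis == -1 && yLength == xChannels))
}
```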
@@ -23,27 +23,7 @@ class ElementwiseAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
try param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
metalParam = ElementwiseAddMetalParam.init()
let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) }
let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) }
let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) }
let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) }
metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
if param.axis == -1 {
metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout())
} else {
metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
}
metalParam.ylen = Int32(param.inputY.tensorDim.cout())
if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
// print("===> elementwise_add fast!!!")
metalParam.fast = 1
}
metalParam = ElementwiseAddKernel<P>.metalParamFrom(inputX: param.inputX, inputY: param.inputY, axis: param.axis)
if GlobalConfig.shared.computePrecision == .Float32 {
if param.mode == "channel" {
......
@@ -66,6 +66,7 @@ class MemoryOptimize: MemoryManager {
var createdNodes = [String: Node]()
var nodesArray = [Node]()
let scope = program.scope
var fetchVarNames: [String] = []
func appendNodes(textureDic: [String: [String]], varsDic: [String: PMVarDesc]) {
for dicPair in textureDic {
for varName in dicPair.value {
@@ -94,9 +95,16 @@
varsDic[varDesc.name] = varDesc
}
for op in block.ops {
if op.type == gFetchType {
for names in op.inputs.values {
fetchVarNames.append(contentsOf: names)
}
}
appendNodes(textureDic: op.inputs, varsDic: varsDic)
appendNodes(textureDic: op.paraInputs, varsDic: varsDic)
appendNodes(textureDic: op.outputs, varsDic: varsDic)
appendNodes(textureDic: op.inputs, varsDic: varsDic)
appendNodes(textureDic: op.paraInputs, varsDic: varsDic)
}
}
var nodeGroups: [[Node]] = []
@@ -106,7 +114,8 @@
node.visited = true
var placed = false
for i in 0..<nodeGroups.count {
if nodeGroups[i].last?.count == 0 {
let lastNode = nodeGroups[i].last
if lastNode?.count == 0 && !fetchVarNames.contains(lastNode?.name ?? "") {
nodeGroups[i].append(node)
placed = true
break
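Texture reuse greedily packs nodes into groups that share one backing texture, appending to a group only when its last node's reference count has dropped to zero. The fix adds a second condition: never reuse the texture of a fetch input, because fetched results must stay alive until the host reads them back, and with multiple fetch ops a reused texture would overwrite an earlier result. (The change above also walks `paraInputs`, so parameter textures now take part in the liveness counting, per the commit message.) The guard, as an illustrative helper:

```swift
// Illustrative helper (not in the diff): a group may be extended only if
// its tail node is dead (refcount zero) and does not feed a fetch op.
func canReuse(tail: Node?, fetchVarNames: [String]) -> Bool {
    guard let tail = tail else { return false }
    return tail.count == 0 && !fetchVarNames.contains(tail.name)
}
```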
@@ -156,7 +165,7 @@ class MemoryBucket {
public func allocHeap() {
let size = maxSizeForTextures(textures)
if size > (heap?.size ?? 0) {
if size != (heap?.size ?? 0) {
heap?.setPurgeableState(.empty)
heap = makeHeapForSize(size)
}
......
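`allocHeap` used to reallocate only when the required size grew; it now reallocates on any size change, so switching to a smaller model releases the surplus `MTLHeap` memory instead of pinning the old maximum. The diff calls `makeHeapForSize(_:)` without showing it; a typical implementation, sketched here as an assumption, builds a heap of exactly the requested size:

```swift
import Metal

// Hedged sketch of what makeHeapForSize(_:) plausibly does: allocate an
// MTLHeap of the requested size for the bucket's textures. Storage mode
// is an assumption; GPU-only textures usually live in private storage.
func makeHeapForSize(_ size: Int, device: MTLDevice) -> MTLHeap? {
    let descriptor = MTLHeapDescriptor()
    descriptor.size = size
    descriptor.storageMode = .private
    return device.makeHeap(descriptor: descriptor)
}
```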