未验证 提交 63e8bb92 编写于 作者: R Ruilong Liu 提交者: GitHub

Merge pull request #556 from codeWorm2015/metal

correct buffer
......@@ -29,11 +29,11 @@ class ViewController: UIViewController {
// let queue: MTLCommandQueue
func scaleTexture(queue: MTLCommandQueue, input: MTLTexture, complete: @escaping (MTLTexture) -> Void) {
let tmpTextureDes = MTLTextureDescriptor.init()
tmpTextureDes.width = 227
tmpTextureDes.height = 227
tmpTextureDes.width = 224
tmpTextureDes.height = 224
tmpTextureDes.depth = 1
tmpTextureDes.usage = [.shaderRead, .shaderWrite]
tmpTextureDes.pixelFormat = .rgba16Float
tmpTextureDes.pixelFormat = .rgba32Float
tmpTextureDes.textureType = .type2D
tmpTextureDes.storageMode = .shared
tmpTextureDes.cpuCacheMode = .defaultCache
......@@ -64,23 +64,18 @@ class ViewController: UIViewController {
}
scaleTexture(queue: queue!, input: inTexture) { (inputTexture) in
let loader = Loader<Float16>.init()
let loader = Loader<Float32>.init()
do {
let modelPath = Bundle.main.path(forResource: "model", ofType: nil) ?! "model null"
let paraPath = Bundle.main.path(forResource: "params", ofType: nil) ?! "para null"
let program = try loader.load(device: self.device, modelPath: modelPath, paraPath: paraPath)
let executor = try Executor<Float16>.init(inDevice: self.device, inQueue: queue!, inProgram: program)
let output = try executor.predict(input: inputTexture, expect: [1, 227, 227, 3])
let executor = try Executor<Float32>.init(inDevice: self.device, inQueue: queue!, inProgram: program)
let output = try executor.predict(input: inputTexture, expect: [1, 224, 224, 3])
// print(output)
} catch let error {
print(error)
}
}
}
}
......@@ -93,7 +93,7 @@ public extension MTLTexture {
print("texture: \(self)")
if textureType == .type2DArray {
for i in 0..<arrayLength{
var str: String = "slice: \(i): "
var str: String = "slice: \(i): \n"
let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<T>.size, alignment: MemoryLayout<T>.alignment)
let bytesPerRow = width * depth * 4 * MemoryLayout<T>.size
let bytesPerImage = width * height * depth * 4 * MemoryLayout<T>.size
......@@ -142,8 +142,25 @@ public extension MTLTexture {
}
public extension MTLBuffer {
    /// Prints the buffer's contents to the console, interpreted as a sequence of `T`.
    ///
    /// - Parameters:
    ///   - header: Line printed before the dump.
    ///   - stridable: When `true` and the buffer holds more than 1000 elements, only
    ///     about 100 evenly spaced samples are printed; otherwise every element is printed.
    /// - Returns: Always `nil`. The return type exists only so the caller can choose `T`
    ///   by annotating the result, e.g. `let _: Float32? = buffer.logDesc()`.
    func logDesc<T>(header: String = "", stridable: Bool = true) -> T? {
        print(header)
        print("MTLBuffer: \(self) ")

        // `length` is a byte count; convert it to an element count once so both branches
        // agree. `stride` (never zero) is the distance between consecutive elements,
        // which is what pointer subscripting advances by.
        let elementCount = length / MemoryLayout<T>.stride
        let elements = contents().assumingMemoryBound(to: T.self)

        var str = ""
        if stridable && elementCount > 1000 {
            // The upper bound must be the element count, not the byte length, otherwise
            // the subscript reads past the end of the buffer for any T wider than a byte.
            // The step is at least 10 here because elementCount > 1000.
            for j in stride(from: 0, to: elementCount, by: elementCount / 100) {
                str += " \(elements[j])"
            }
        } else {
            for i in 0..<elementCount {
                str += " \(elements[i])"
            }
        }
        print(str)
        return nil
    }
}
......
......@@ -55,17 +55,8 @@ public class Executor<P: PrecisionType> {
device = inDevice
queue = inQueue
for block in inProgram.programDesc.blocks {
// for i in 0..<2 {
// let op = block.ops[i]
// do {
// let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
// op.inferShape()
// ops.append(op)
// } catch let error {
// throw error
// }
// }
for op in block.ops {
for i in 0..<2 {
let op = block.ops[i]
do {
let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
op.inferShape()
......@@ -74,6 +65,15 @@ public class Executor<P: PrecisionType> {
throw error
}
}
// for op in block.ops {
// do {
// let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
// op.inferShape()
// ops.append(op)
// } catch let error {
// throw error
// }
// }
}
}
......@@ -95,9 +95,9 @@ public class Executor<P: PrecisionType> {
buffer.addCompletedHandler { (commandbuffer) in
// for op in self.ops {
// op.delogOutput()
// }
for op in self.ops {
op.delogOutput()
}
let afterDate = Date.init()
print(" encoder end ! time: \(afterDate.timeIntervalSince(beforeDate))")
......@@ -114,7 +114,6 @@ public class Executor<P: PrecisionType> {
throw PaddleMobileError.netError(message: "output var type error")
}
return output
}
......
......@@ -50,7 +50,7 @@ public class Loader<P: PrecisionType> {
return pointee
}
_ = pointerReader(type: UInt32.self)
let _ = pointerReader(type: UInt32.self)
let lodLevel = pointerReader(type: UInt64.self)
for _ in 0..<lodLevel {
let size = pointerReader(type: UInt64.self)
......@@ -62,6 +62,7 @@ public class Loader<P: PrecisionType> {
let _ = pointerReader(type: UInt32.self)
let tensorDescSize = pointerReader(type: Int32.self)
fseek(file, Int(tensorDescSize), SEEK_CUR)
nowIndex += Int(tensorDescSize)
......@@ -70,21 +71,21 @@ public class Loader<P: PrecisionType> {
*/
//现在模型传入模型为 Float 类型, 这块应该根据模型来
let tmpCapacity = MemoryLayout<Float>.size * tensor.numel()
let tmpPointer = UnsafeMutablePointer<Float>.allocate(capacity: tmpCapacity);
// let tmpCapacity = MemoryLayout<Float>.size * tensor.numel()
// let tmpPointer = UnsafeMutablePointer<Float>.allocate(capacity: tmpCapacity);
let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file)
// let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file)
// guard bytesRead == tensor.data.size else {
// throw PaddleMobileError.loaderError(message: "param read size error")
// }
guard bytesRead == tensor.data.size else {
throw PaddleMobileError.loaderError(message: "param read size error")
}
// TODO: use script to convert
let bytesRead = fread(tmpPointer, 1, tmpCapacity, file)
for i in 0..<tensor.numel() {
tensor.data[i] = P.init(inFloat: tmpPointer[i])
}
tmpPointer.deinitialize(count: tmpCapacity)
tmpPointer.deallocate()
// let bytesRead = fread(tmpPointer, 1, tmpCapacity, file)
// for i in 0..<tensor.numel() {
// tensor.data[i] = P.init(inFloat: tmpPointer[i])
// }
// tmpPointer.deinitialize(count: tmpCapacity)
// tmpPointer.deallocate()
nowIndex += bytesRead
}
......
......@@ -107,7 +107,16 @@ class ConvAddBatchNormReluOp<P: PrecisionType>: Operator<ConvAddBatchNormReluKer
}
/// Debug helper that prints this op's input texture, filter, bias, derived
/// scale/bias values and output texture to the console, in that order.
///
/// Each `let _: P? = ...logDesc(...)` discards the returned value; the `P?`
/// annotation is only there to select the element type used by the generic `logDesc`.
func delogOutput() {
// Input texture, logged with stridable: false.
// NOTE(review): presumably this means "print every element" as it does for the buffer logger — confirm for textures.
let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false)
// Raw filter values through the tensor's data pointer, followed by the filter's and bias's own descriptions.
para.filter.logDataPointer(header: "filter data pointer: ")
print("filter: \(para.filter)")
print("biase: \(para.bias)")
// newBiase / newScale are optional; optional chaining skips the dump when they are nil.
let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false)
let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false)
// Output texture, logged with stridable: true.
let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: true)
}
}
......@@ -61,8 +61,8 @@ class FeedOp<P: PrecisionType>: Operator<Texture2DTo2DArrayKernel<P>, FeedParam<
func delogOutput() {
// para.input.mtlTexture.logDesc()
let _: Float16? = para.input.mtlTexture.logDesc(header: "feed input: ")
let _: Float16? = para.output.metalTexture.logDesc(header: "feed output: ")
// let _: P? = para.input.mtlTexture.logDesc(header: "feed input: ", stridable: true)
// let _: P? = para.output.metalTexture.logDesc(header: "feed output: ", stridable: true)
}
}
......@@ -29,7 +29,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable {
let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
for i in 0..<param.variance.buffer.length/MemoryLayout<P>.stride {
let inv = pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
invs.append(P(inv))
}
......@@ -59,7 +59,6 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable {
}
print("ConvAddBatchNormReluKernel compute")
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
......
......@@ -59,13 +59,56 @@ kernel void conv3x3(texture2d_array<half, access::sample> inTexture [[texture(0)
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_batch_norm_relu_3x3(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
//kernel void conv_add_batch_norm_relu_3x3(texture2d_array<half, access::sample> inTexture [[texture(0)]],
// texture2d_array<half, access::write> outTexture [[texture(1)]],
// constant MetalConvParam &param [[buffer(0)]],
// const device half4 *weights [[buffer(1)]],
// const device half4 *biase [[buffer(2)]],
// const device half4 *new_scale [[buffer(3)]],
// const device half4 *new_biase [[buffer(4)]],
// uint3 gid [[thread_position_in_grid]]) {
//
// if (gid.x >= outTexture.get_width() ||
// gid.y >= outTexture.get_height() ||
// gid.z >= outTexture.get_array_size()) {
// return;
// }
//
// short2 posInInput = short2(gid.xy) + short2(param.offsetX, param.offsetY);
// constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
// const uint wightSliceCount = 36;
// uint weithTo = gid.z * wightSliceCount * inTexture.get_array_size();
// half4 output = 0.0;
// for (uint i = 0; i < inTexture.get_array_size(); ++i) {
// half4 input[9];
// input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
// input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
// input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
// input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
// input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
// input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
// input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
// input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
// input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
// for (int j = 0; j < 9; ++j) {
// half4 weight = weights[weithTo + wightSliceCount * i + j * 4];
// output += dot(input[j], weight);
// }
// }
//
// output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0h);
// outTexture.write(output, gid.xy, gid.z);
//
//}
kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device half4 *new_scale [[buffer(3)]],
const device half4 *new_biase [[buffer(4)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
......@@ -78,9 +121,9 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<half, access::sample> i
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint wightSliceCount = 36;
uint weithTo = gid.z * wightSliceCount * inTexture.get_array_size();
half4 output = 0.0;
float4 output = 0.0;
for (uint i = 0; i < inTexture.get_array_size(); ++i) {
half4 input[9];
float4 input[9];
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
......@@ -91,12 +134,12 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<half, access::sample> i
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
half4 weight = weights[weithTo + wightSliceCount * i + j * 4];
float4 weight = weights[weithTo + wightSliceCount * i + j * 4];
output += dot(input[j], weight);
}
}
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0h);
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z);
}
......
......@@ -73,15 +73,25 @@ kernel void batchnorm(texture2d_array<half, access::read> inTexture [[texture(0)
outTexture.write(input, gid.xy, gid.z);
}
kernel void texture2d_to_2d_array(texture2d<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
//kernel void texture2d_to_2d_array(texture2d<half, access::read> inTexture [[texture(0)]],
// texture2d_array<half, access::write> outTexture [[texture(1)]],
// uint3 gid [[thread_position_in_grid]]) {
// if (gid.x >= inTexture.get_width() ||
// gid.y >= inTexture.get_height()){
// return;
// }
// const half4 input = inTexture.read(gid.xy);
// outTexture.write(input, gid.xy, 0);
//}
kernel void texture2d_to_2d_array(texture2d<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= inTexture.get_width() ||
gid.y >= inTexture.get_height()){
return;
}
const half4 input = inTexture.read(gid.xy);
const float4 input = inTexture.read(gid.xy);
outTexture.write(input, gid.xy, 0);
}
......@@ -32,6 +32,9 @@ extension Int64: Attr {
// Empty conformance: marks Array as usable wherever an `Attr` is expected.
extension Array: Attr {
}
// Empty conformance: marks String as usable wherever an `Attr` is expected
// (presumably so string-typed proto attributes can be returned as `Attr` — confirm against attrWithProtoDesc).
extension String: Attr {
}
func attrWithProtoDesc(attrDesc: PaddleMobile_Framework_Proto_OpDesc.Attr) -> Attr {
switch attrDesc.type {
case .boolean:
......@@ -39,7 +42,7 @@ func attrWithProtoDesc(attrDesc: PaddleMobile_Framework_Proto_OpDesc.Attr) -> At
case .int:
return Int(attrDesc.i)
case .string:
return attrDesc.strings
return attrDesc.s
case .long:
return attrDesc.l
case .float:
......
......@@ -38,7 +38,7 @@ class Tensor<P: PrecisionType>: Tensorial {
pointer = inPointer
}
let size: Int
fileprivate var pointer: UnsafeMutablePointer<P>
var pointer: UnsafeMutablePointer<P>
subscript(index: Int) -> P{
get {
return pointer[index]
......@@ -104,7 +104,7 @@ class Tensor<P: PrecisionType>: Tensorial {
for _ in 0..<dim[0] * dim[1] * dim[2] {
for j in 0..<paddedC {
if j < C {
dstPtr?[j] = data.pointer[j]
dstPtr?[j] = tmpPointer[j]
}
}
tmpPointer += C
......@@ -134,7 +134,7 @@ class Tensor<P: PrecisionType>: Tensorial {
for h in 0..<H{
for w in 0..<W{
for c in 0..<C{
newPtr[index] = data.pointer[n * CXHXW + c * HXW + h * w + w]
newPtr[index] = data.pointer[n * CXHXW + c * HXW + h * W + w]
index += 1
}
}
......@@ -146,27 +146,25 @@ class Tensor<P: PrecisionType>: Tensorial {
extension Tensor {
var debugDescription: String {
var str = ""
// for i in 0..<buffer.length/MemoryLayout<P>.stride {
// str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])"
// }
/// Textual dump for debugging: the dimensions, the backing MTLBuffer's own
/// description, then every element of the buffer read as `P`, space-separated.
var debugDescription: String {
    var text = "dim: \(dim) \n" + "MTLBuffer: \(self.buffer) \n"
    let elementCount = buffer.length / MemoryLayout<P>.size
    (0..<elementCount).forEach { index in
        text += " \(buffer.contents().assumingMemoryBound(to: P.self)[index])"
    }
    return text
}
// var str = ""
// str += "Dim: \(dim) \n value:[ "
// if data.size < 20 {
// for d in 0..<data.size {
// str += " \(data[d]) "
// }
// } else {
// for d in stride(from: 0, to: data.size, by: data.size/20) {
// str += " \(data[d]) "
// }
// }
// str += " ]"
// return str
/// Prints `header`, then a single string holding the data size, the dimensions
/// and all `numel()` values read through `data.pointer`.
///
/// - Parameter header: Line printed before the dump.
func logDataPointer(header: String = "") {
    print(header)
    let summary = "data size: \(data.size) \n" + "dim: \(dim) \n"
    let values = (0..<numel()).map { " \(data.pointer[$0])" }.joined()
    print(summary + values)
}
var description: String {
......
......@@ -69,7 +69,7 @@ public class Texture<P: PrecisionType>: Tensorial {
if MemoryLayout<P>.size == 1 {
tmpTextureDes.pixelFormat = .rgba8Unorm
} else if MemoryLayout<P>.size == 2 {
tmpTextureDes.pixelFormat = .rgba16Float
tmpTextureDes.pixelFormat = .rgba32Float
} else if MemoryLayout<P>.size == 4 {
// tmpTextureDes.pixelFormat = .r32Float
tmpTextureDes.pixelFormat = .rgba32Float
......@@ -130,7 +130,7 @@ extension Texture {
public var debugDescription: String{
var str = ""
str += "Dim: \(dim) \n value:[ "
// str += "\(metalTexture)"
str += "\(metalTexture)"
str += " ]"
return str
}
......
......@@ -19,7 +19,10 @@ limitations under the License. */
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto time1 = time();
auto program = loader.Load(g_mobilenet, true);
// auto program = loader.Load(g_mobilenet_combine, true);
auto program = loader.Load(g_mobilenet_combine + "/model",
g_mobilenet_combine + "/params", true);
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, true);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册