High-quality blog posts
- onnx-to-tf conversion error "cannot import name 'dtensor' from 'tensorflow.compat.v2.experimental'": caused by mismatched tf and keras versions. With tf 2.6.0, pin keras to 2.6.0 as well. (wdm-student's blog on CSDN)
- Model visualization site: netron.app
- ONNX ==> TNN conversion is unreliable; follow the official docs instead. One-click conversion of Caffe, ONNX, TensorFlow to NCNN, MNN, Tengine (convertmodel.com)
Accelerating Model Inference
ONNX
Export a PyTorch model to ONNX and run inference with it:
- Install onnx and onnxruntime (CPU build):
pip install onnx
pip install onnxruntime
- Export the model to ONNX.
- Run the ONNX model with onnxruntime.
import torch
from models import get_model_resnet

def model_converter():
    weight = torch.load('xxx/best34.pt')
    model = get_model_resnet(net_name="resnet34")
    model.load_state_dict(weight)
    model.eval()
    # Static input: the shape is fixed, and what to fix it to depends on the task.
    # This is a grading (classification) task, so I fixed the shape here.
    dummy_input = torch.randn(1, 3, 720, 720, device='cpu')
    input_names = ['data']
    output_names = ['output']
    torch.onnx.export(model, dummy_input, 'resnet34.onnx',
                      export_params=True,
                      verbose=True,
                      input_names=input_names,
                      output_names=output_names)

model_converter()
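After exporting, a quick structural check with the onnx package catches malformed graphs early. A minimal sketch (my addition), assuming the resnet34.onnx produced above:

import onnx

# sanity check on the exported file
onnx_model = onnx.load('resnet34.onnx')
onnx.checker.check_model(onnx_model)  # raises if the graph is malformed
print(onnx.helper.printable_graph(onnx_model.graph))  # human-readable graph dump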
Load the ONNX model and run inference
import time

import cv2
import numpy as np
import onnxruntime  # CPU build used here

def softmax(x):
    x -= np.max(x, axis=1, keepdims=True)
    f_x = np.exp(x) / np.sum(np.exp(x), axis=1, keepdims=True)
    return f_x

def predict():
    for i in range(0, 200):
        image = cv2.imread('xxx/163.png')
        image = cv2.resize(image, (720, 720))  # must match the exported ONNX input size
        height, width = image.shape[:2]  # H, W, C
        image = image / np.max(image)
        tmpImg = np.zeros((height, width, 3))
        start = time.time()
        # channel reorder plus mean/std normalization (must match the training preprocessing)
        tmpImg[:, :, 0] = (image[:, :, 2] - 0.406) / 0.225
        tmpImg[:, :, 1] = (image[:, :, 1] - 0.456) / 0.224
        tmpImg[:, :, 2] = (image[:, :, 0] - 0.485) / 0.229
        tmpImg = tmpImg.transpose((2, 0, 1)).astype(np.float32)  # HWC -> CHW
        tmpImg = tmpImg[np.newaxis, :, :, :]  # CHW -> NCHW
        # Inference (creating the session once outside the loop would be faster)
        # providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
        providers = ['CPUExecutionProvider']
        onnx_session = onnxruntime.InferenceSession('/xxx/resnet34.onnx', providers=providers)
        onnx_input = {'data': tmpImg}
        onnx_output = onnx_session.run(['output'], onnx_input)[0]
        prob = softmax(onnx_output)  # softmax turns the logits into confidence scores
        print(time.time() - start)  # per-image inference time
        # improved from 0.67~0.61 s down to 0.42~0.31 s

predict()
For GPU inference with ONNX Runtime, install the GPU build: pip install onnxruntime-gpu
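A quick way to confirm the GPU build is picked up; a minimal sketch assuming the resnet34.onnx exported earlier. onnxruntime tries the providers in order, so CPU serves as the fallback:

import onnxruntime

# with onnxruntime-gpu installed, this should list CUDAExecutionProvider
print(onnxruntime.get_available_providers())

session = onnxruntime.InferenceSession(
    'resnet34.onnx',
    providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])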
TensorRT
Use the library directly: https://github.com/NVIDIA-AI-IOT/torch2trt
import time

import torch
import torchvision
from torch2trt import torch2trt

model = torchvision.models.resnet34(pretrained=False).cuda().half().eval()
data = torch.randn((1, 3, 224, 224)).cuda().half()

print("start trt")
tmp = time.time()
model_trt = torch2trt(model, [data], fp16_mode=True)
print(f"finish trt: {time.time() - tmp}")

trt_start = time.time()
output_trt = model_trt(data)  # tensorrt ~0.001 s
print(time.time() - trt_start)

model_start = time.time()
output = model(data)
print(time.time() - model_start)  # pytorch ~0.005 s

torch.save(model_trt.state_dict(), 'resnet34_trt.pth')
print(output.flatten()[0:10])
print(output_trt.flatten()[0:10])
print('max error: %f' % float(torch.max(torch.abs(output - output_trt))))
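To reuse the saved engine later without rebuilding it, torch2trt provides TRTModule. A minimal sketch, assuming the resnet34_trt.pth saved above:

import torch
from torch2trt import TRTModule

# load the serialized engine back into a callable module
model_trt = TRTModule()
model_trt.load_state_dict(torch.load('resnet34_trt.pth'))

x = torch.randn((1, 3, 224, 224)).cuda().half()
y = model_trt(x)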
NumPy Acceleration
pip install numba
Decorating a function with @numba.njit speeds up the numpy code inside it, as sketched below.
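A minimal sketch of the pattern (the function name and workload are made up for illustration): numba compiles explicit loops over numpy arrays to machine code, which is where it pays off most.

import numba
import numpy as np

@numba.njit  # compiled to machine code on the first call
def l2_distances(a, b):
    # explicit loops like this are exactly what numba accelerates
    out = np.empty(a.shape[0], dtype=np.float64)
    for i in range(a.shape[0]):
        s = 0.0
        for j in range(a.shape[1]):
            d = a[i, j] - b[i, j]
            s += d * d
        out[i] = np.sqrt(s)
    return out

x = np.random.rand(1000, 128)
y = np.random.rand(1000, 128)
print(l2_distances(x, y)[:5])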
TNN
Model deployment on mobile. Supports most operators, but currently does not support five-dimensional cat, permute, and similar ops.
ONNX2TNN
Conversion from ONNX to TNN.
-
Create a conda environment and install the packages below. Pin protobuf to 3.20.x or lower; if other packages complain about versions being too high or too low, switch to the requested versions.
pip3 install onnx==1.6.0 onnxruntime numpy onnx-simplifier protobuf==3.20.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
-
Clone the TNN repo on Ubuntu; the URL below is a gitee mirror.
git clone https://gitee.com/atari/TNN.git
-
Enter the TNN/tools/onnx2tnn/onnx-converter directory and build:
cd <path-to-tnn>/tools/onnx2tnn/onnx-converter
./build.sh
-
Convert the model with onnx2tnn.py in the onnx-converter directory:
# show the help
python onnx2tnn.py -h
# e.g. convert yolov5s-face.onnx to tnn with input size 1,3,640,640 and optimization enabled
python onnx2tnn.py yolov5s-face.onnx -version=v1.0 -optimize=1 -o ./ -input_shape input:1,3,640,640
-
After conversion, load the generated model and weights into netron and verify they look correct.
Note! A slice op should appear as StridedSliceV2 in the visualized graph; a model I converted earlier showed StridedSlice and kept failing at runtime.
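Besides eyeballing the graph in netron, you can dump the distinct ONNX op types and compare them against TNN's supported-operator list before converting. A small sketch, assuming the yolov5s-face.onnx used above:

import onnx

# list the distinct op types so they can be checked against TNN's operator support
model = onnx.load("yolov5s-face.onnx")
print(sorted({node.op_type for node in model.graph.node}))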
Building a TNN Project
... TBD
Debugging TNN
- In /source/tnn/utils/blob_dump_utils.h under the project root, change:
- #define DUMP_INPUT_BLOB 0 -> #define DUMP_INPUT_BLOB 1 to dump every layer's input
- #define DUMP_OUTPUT_BLOB 0 -> #define DUMP_OUTPUT_BLOB 1 to dump every layer's output
- For debugging only
ONNX
onnx2trt
Converting from onnx to trt in code is too slow; use the official tool instead.
git clone --recursive -b 8.2-EA https://github.com/onnx/onnx-tensorrt.git
cd onnx-tensorrt
mkdir build
cd build
# change /path/to/TensorRT-8.2.4.2 to your own absolute TensorRT path
cmake .. -DTENSORRT_ROOT=/path/to/TensorRT-8.2.4.2
make -j8
make install
torch2onnx
Export a torch model to ONNX:
import argparse
import sys
import time
import os
import numpy as np
import cv2
sys.path.append('./')
import torch
import torch.nn as nn
import models
from models.experimental import attempt_load
from utils.activations import Hardswish, SiLU
import onnx

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str, default='/home/payphone/work/yolov5-face/weights/yolov5s-face.pt', help='weights path')  # from yolov5/models/
    parser.add_argument('--batch_size', type=int, default=1, help='batch size')
    parser.add_argument('--dynamic', action='store_true', default=False, help='enable dynamic axis in onnx model')
    parser.add_argument('--onnx_infer', action='store_true', default=True, help='onnx infer test')
    opt = parser.parse_args()

    # Load PyTorch model
    model = attempt_load(opt.weights, map_location=torch.device('cpu'))  # load FP32 model
    delattr(model.model[-1], 'anchor_grid')  # yolo does this on every export
    model.model[-1].anchor_grid = [torch.zeros(1)] * 3  # nl=3 number of detection layers
    model.model[-1].export_cat = True
    model.eval()
    labels = model.names

    # ====================================
    img0 = cv2.imread("/home/payphone/work/yolov5-face/data/images/bus.jpg")
    H, W, C = img0.shape
    origin = np.zeros(shape=(np.max(img0.shape), np.max(img0.shape), 3))
    origin[0:H, 0:W, ] = img0
    origin = cv2.resize(origin, dsize=(640, 640))
    padding_img = origin.transpose(2, 0, 1).copy()
    padding_img = torch.from_numpy(padding_img).float() / 255.0
    padding_img = padding_img.unsqueeze(0)
    # ====================================

    # Update model
    # (some model-update steps omitted here)
    y = model(padding_img)
    model.fuse()  # only for ONNX

    f = "export_model.onnx"
    input_names = ['input']
    output_names = ['output']
    torch.onnx.export(model, padding_img, f, verbose=False, opset_version=12,
                      input_names=input_names,
                      output_names=output_names,
                      dynamic_axes={'input': {0: 'batch'},
                                    'output': {0: 'batch'}
                                    } if opt.dynamic else None)
    onnx_model = onnx.load(f)
    onnx.checker.check_model(onnx_model)  # check onnx model

    # ===============================================================================
    # Simplify the ONNX model
    # ===============================================================================
    from onnxsim import simplify
    onnx_model, check = simplify(onnx_model)
    assert check, "Simplified ONNX model could not be validated"
    onnx.save(onnx_model, f)

    # onnx infer
    if opt.onnx_infer:
        import onnxruntime
        providers = ['CPUExecutionProvider']
        session = onnxruntime.InferenceSession(f, providers=providers)
        padding_img = padding_img.cpu().numpy().astype(np.float32)  # torch to numpy
        # [session.get_outputs()[0].name] lists the output names to fetch; there may be
        # one or several depending on the network, and they must match the
        # output_names used at export time
        y_onnx = session.run([session.get_outputs()[0].name], {session.get_inputs()[0].name: padding_img})[0]
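A sanity check worth appending to the script above (my addition, not in the original): compare the PyTorch output y with y_onnx. This sketch assumes y is a single tensor, which depends on how the head's export branch is written.

import numpy as np

def check_outputs(y, y_onnx):
    # y: torch output from model(padding_img); y_onnx: onnxruntime output
    np.testing.assert_allclose(y.detach().cpu().numpy(), y_onnx, rtol=1e-3, atol=1e-5)
    print("torch and onnx outputs match within tolerance")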
Changing the ONNX batch size
Some onnx files are exported with a dynamic batch size, which can cause problems when converting to trt; you can rewrite the batch size yourself. If some branch's batch size does not change, check whether the traversal skipped it (e.g. a mismatched key name).
import onnx

def change_input_dim(model):
    batch_size = 1
    inputs = model.graph.input
    for input in inputs:
        dim1 = input.type.tensor_type.shape.dim[0]
        if isinstance(batch_size, str) and not batch_size.isdigit():
            dim1.dim_param = batch_size       # symbolic batch, e.g. 'N'
        elif isinstance(batch_size, (int, str)):
            dim1.dim_value = int(batch_size)  # fixed batch
        else:
            dim1.dim_value = 1

def apply(transform, infile, outfile):
    model = onnx.load(infile)
    transform(model)
    onnx.save(model, outfile)

if __name__ == '__main__':
    source = '/xxx/face_mesh_Nx3x192x192_post.onnx'
    target = "./face_mesh_1x3x192x192_post.onnx"
    apply(change_input_dim, source, target)
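To confirm the rewrite took effect on every input (my addition), print each input's dimensions from the converted file:

import onnx

model = onnx.load("./face_mesh_1x3x192x192_post.onnx")
for inp in model.graph.input:
    # dim_param holds a symbolic name ('N'); dim_value holds a fixed size
    dims = [d.dim_param or d.dim_value for d in inp.type.tensor_type.shape.dim]
    print(inp.name, dims)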
Deleting ONNX branches
ONNX删除节点示例(Deeplabv3plus) (DazeAJD's blog on CSDN)
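The gist of that post, as a minimal sketch (the node name "Identity_42" is hypothetical): nodes live in model.graph.node and can be removed directly, but the neighbors' inputs/outputs must be rewired by hand or the graph will not validate.

import onnx

model = onnx.load("model.onnx")
graph = model.graph

# remove a node by name (hypothetical name; inspect real names in netron first)
dead = [n for n in graph.node if n.name == "Identity_42"]
for n in dead:
    graph.node.remove(n)

# after rewiring the neighbors' inputs/outputs, validate and save
onnx.checker.check_model(model)
onnx.save(model, "model_pruned.onnx")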
TensorRT
TensorRT model conversion
You can convert with the official trtexec tool directly, or write the conversion code yourself. Export the model to ONNX first, then convert to TRT. The code below targets trt 8.x.
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import numpy as np
from utils.general import non_max_suppression_face

# Conversion to tensorrt succeeded! tensorrt 8.x, cuda 11.3, cudnn version unknown
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def GiB(val):
    return val * 1 << 30

def ONNX_to_TRT(onnx_model_path=None, trt_engine_path=None, fp16_mode=False):
    """
    TensorRT V8 only.
    Builds a cudaEngine and saves the engine file (fixed input shape only).
    fp16_mode: True for fp16 inference
    onnx_model_path: path of the onnx weights to load
    trt_engine_path: where to save the trt engine file
    """
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(EXPLICIT_BATCH)
    parser = trt.OnnxParser(network, TRT_LOGGER)
    config = builder.create_builder_config()
    config.max_workspace_size = GiB(1)
    if fp16_mode:
        config.set_flag(trt.BuilderFlag.FP16)
    with open(onnx_model_path, 'rb') as model:
        assert parser.parse(model.read())
    serialized_engine = builder.build_serialized_network(network, config)
    # tensorrt 7.2.3.4
    with open(trt_engine_path, 'wb') as f:
        f.write(serialized_engine)  # serialize
    print('TensorRT file in ' + trt_engine_path)
    print('============ONNX->TensorRT SUCCESS============')

class TrtModel():
    '''
    TensorRT infer
    '''
    def __init__(self, trt_path):
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)
        # Deserialize the engine from file
        with open(trt_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []
        for binding in engine:
            print('binding:', binding, engine.get_binding_shape(binding))
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def __call__(self, img_np_nchw):
        '''
        TensorRT inference
        :param img_np_nchw: input image
        '''
        self.ctx.push()
        # Restore
        stream = self.stream
        context = self.context
        engine = self.engine
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # img_np_nchw.shape = (1, 3, 640, 640), flattened to 1-D (1228800,)
        np.copyto(host_inputs[0], img_np_nchw.ravel())
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        stream.synchronize()
        self.ctx.pop()
        return host_outputs

    def destroy(self):
        self.ctx.pop()

if __name__ == "__main__":
    import cv2
    import torch

    path = "/home/payphone/work/yolov5-face/data/images/zidane.jpg"
    img = cv2.imread(path)
    img = cv2.resize(img, dsize=(640, 640)) / 255
    img = np.transpose(img, (2, 0, 1))
    img = np.expand_dims(img, 0)

    trt_model = TrtModel("/home/payphone/work/yolov5-face/weights/yolov5s-face-fp32.trt")
    ans = trt_model(img)
    ans = np.reshape(ans, (1, -1, 16))
    ans = torch.tensor(ans).cuda()
    pred = non_max_suppression_face(ans, 0.1, 0.1)

    origin = cv2.imread(path)
    origin = cv2.resize(origin, dsize=(640, 640))
    for result in pred[0]:
        x1 = int(result[0])
        y1 = int(result[1])
        x2 = int(result[2])
        y2 = int(result[3])
        cv2.rectangle(origin, (x1, y1), (x2, y2), (0, 255, 0), thickness=2)
    cv2.imwrite("dd.jpg", origin)
    trt_model.destroy()
TFLite
TFLite quantized-model issues
Accuracy drop
- Normalization scheme. Normalizing to [0,1] vs [-1,1] makes little difference in FP32 but a large difference in uint8.
- Operator mismatch: an op may need replacing. e.g. the local inference environment supports PReLU but the deployment target does not.
Abnormal outputs
When converting a yolo model, the score values collapsed: either all 0 or all greater than 1.
A yolo model's output should be (xywh, score, classes). If the xywh values are greater than 1 while the scores are either 0 or all above 1 (e.g. all 14.5), normalization is missing: scale score and xywh into one shared range, e.g. (0, 1).
tflite notes
- By default the output has max=1, min=0.
- For tflite, if the input range is (0,1), use mean=0, std=255; if the input range is (-1,1), use mean=127.5, std=127.5 (see the quick check after this list).
- If netron shows an op that tflite has no counterpart for, it is most likely unsupported and must be replaced with a supported op.
- Quantization can fail if the post-processing isn't handled well. Watch out for version issues too.
- onnx2tflite operator support
- Docs on operator compatibility:
- https://tensorflow.google.cn/lite/performance/hexagon_delegate?hl=zh-cn
- https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/hexagon/README.md
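A quick check of the mean/std pairs in the list above, using normalized = (raw - mean) / std:

import numpy as np

raw = np.array([0.0, 127.5, 255.0])
print((raw - 0.0) / 255.0)     # [0.  0.5 1. ]  -> (0, 1) range
print((raw - 127.5) / 127.5)   # [-1.  0.  1.]  -> (-1, 1) range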
Converting to TFLite
tflite-to-onnx library: onnx/tensorflow-onnx: Convert TensorFlow, Keras, Tensorflow.js and Tflite models to ONNX (github.com)
torch ==> tflite
onnx ==> tflite
from onnx_tf.backend import prepare
import onnx
import tensorflow as tf
import torch
import numpy as np
import cv2
import sys
from PIL import Image
sys.path.append('./')
from utils.general import non_max_suppression_face
def export_tf(onnx_path="weights/yolov5s-face.onnx", tf_save_path="./yolov5s-face.tf"):
    onnx_model = onnx.load(onnx_path)
    tf_rep = prepare(onnx_model)
    tf_rep.export_graph(tf_save_path)
def export_tflite(tf_save_path="./yolov5s-face.tf", tflite_save_pth="./model.tflite"):
    converter = tf.lite.TFLiteConverter.from_saved_model(tf_save_path)
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS,
        tf.lite.OpsSet.SELECT_TF_OPS,
    ]
    converter.target_spec.supported_types = [tf.float32]
    converter.inference_input_type = tf.float32  # tf.uint8 or tf.int8
    converter.inference_output_type = tf.float32  # tf.uint8 or tf.int8
    tflite_model = converter.convert()
    with open(tflite_save_pth, 'wb') as f:
        f.write(tflite_model)
def representative_dataset_gen():
    import os
    path = "/home/payphone/work/datasets/"
    names = os.listdir(path)
    files = [os.path.join(path, item) for item in names]
    abs_paths = []
    for file in files:
        abs_paths += [os.path.join(file, item) for item in os.listdir(file)]
    samples = []  # could go up to len(abs_paths) - 1
    for item in range(0, 50, 2):
        samples.append(abs_paths[item])
    for item in samples:
        try:
            test_image = cv2.cvtColor(cv2.imread(item), cv2.COLOR_BGR2RGB).astype(np.float32)
            input_data = cv2.resize(test_image, dsize=(256, 256)) / 255
            yield [np.expand_dims(np.transpose(input_data.astype(np.float32), [2, 0, 1]), axis=0)]
        except Exception:
            print(item)
def export_tflite_uint8(tf_save_path="./yolov5s-face.tf", tflite_save_pth="./model.tflite"):
    converter = tf.lite.TFLiteConverter.from_saved_model(tf_save_path)
    converter.representative_dataset = lambda: representative_dataset_gen()
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]  # tf.lite.OpsSet.SELECT_TF_OPS
    converter.target_spec.supported_types = []
    converter.inference_input_type = tf.uint8  # or tf.int8
    converter.inference_output_type = tf.uint8  # or tf.int8
    converter.experimental_new_quantizer = False  # quantization fails when this is enabled
    converter.experimental_new_converter = True
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.target_spec.supported_ops.append(tf.lite.OpsSet.SELECT_TF_OPS)
    tflite_model = converter.convert()
    with open(tflite_save_pth + str("int8_with_metadata.tflite"), 'wb') as f:
        f.write(tflite_model)

    # Attach metadata to the tflite file
    import contextlib
    from pathlib import Path
    with contextlib.suppress(ImportError):
        # check_requirements('tflite_support')
        from tflite_support import flatbuffers
        from tflite_support import metadata as _metadata
        from tflite_support import metadata_schema_py_generated as _metadata_fb

        metadata = "{'stride': 32, 'names': {0: 'person'}}"
        tmp_file = Path('/tmp/meta.txt')
        with open(tmp_file, 'w') as meta_f:
            meta_f.write(str(metadata))
        file = "/home/payphone/work/yolov5-face/yolov5-face/yolov5_face.tfliteint8_with_metadata.tflite"

        model_meta = _metadata_fb.ModelMetadataT()
        label_file = _metadata_fb.AssociatedFileT()
        label_file.name = tmp_file.name
        model_meta.associatedFiles = [label_file]
        subgraph = _metadata_fb.SubGraphMetadataT()
        subgraph.inputTensorMetadata = [_metadata_fb.TensorMetadataT()]
        subgraph.outputTensorMetadata = [_metadata_fb.TensorMetadataT()] * 1
        model_meta.subgraphMetadata = [subgraph]

        b = flatbuffers.Builder(0)
        b.Finish(model_meta.Pack(b), _metadata.MetadataPopulator.METADATA_FILE_IDENTIFIER)
        metadata_buf = b.Output()

        populator = _metadata.MetadataPopulator.with_model_file(file)
        populator.load_metadata_buffer(metadata_buf)
        populator.load_associated_files([str(tmp_file)])
        populator.populate()
        tmp_file.unlink()
def test_tflite_yolo(tflite_save_pth="./model.tflite", image_path="torch2trt_onnx/zidane.jpg", fp32=True):
    interpreter = tf.lite.Interpreter(model_path=tflite_save_pth)
    interpreter.allocate_tensors()
    # ==============================================================
    input_details = interpreter.get_input_details()[0]
    output_details = interpreter.get_output_details()[0]
    # ==============================================================
    test_image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
    H, W, C = test_image.shape
    input_data = np.zeros(shape=(np.max(test_image.shape), np.max(test_image.shape), 3))
    input_data[0:H, 0:W, ] = test_image
    input_data = cv2.resize(input_data, dsize=(256, 256))
    if fp32:
        input_data = input_data / 255
        print("fp32~~~")
    else:
        print("uint8~~~")
        input_data = input_data.astype(np.uint8)
    input_data = np.transpose(input_data, [2, 0, 1])
    if input_details['dtype'] == np.uint8:
        input_scale, input_zero_point = input_details["quantization"]
        input_data = input_data / input_scale + input_zero_point
    input_data = np.expand_dims(input_data, axis=0).astype(input_details["dtype"])

    interpreter.set_tensor(input_details["index"], input_data)
    interpreter.invoke()
    output = interpreter.get_tensor(output_details["index"])
    if output_details['dtype'] == np.uint8:
        output_scale, output_zero_point = output_details["quantization"]
        output = (output - output_zero_point) * output_scale
    output = non_max_suppression_face(torch.tensor(output), -1, 0.1)
    print(output[0].shape)

    # redo the padding/resize to draw boxes on the preprocessed image
    test_image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
    H, W, C = test_image.shape
    input_data = np.zeros(shape=(np.max(test_image.shape), np.max(test_image.shape), 3))
    input_data[0:H, 0:W, ] = test_image
    input_data = cv2.resize(input_data, dsize=(256, 256))
    for item in output[0]:
        cv2.rectangle(input_data, (int(item[0]), int(item[1])), (int(item[2]), int(item[3])), (0, 255, 0))
    if fp32:
        cv2.imwrite("yolo_vis_fp32.jpg", input_data)
    else:
        cv2.imwrite("yolo_vis_uint8.jpg", input_data)
def execute(onnx_path, tf_save_path, tflite_save_pth):
    export_tf(onnx_path, tf_save_path)
    export_tflite(tf_save_path, tflite_save_pth)
    export_tflite_uint8(tf_save_path, tflite_save_pth)

if __name__ == "__main__":
    onnx_path = "/home/payphone/work/yolov5-face/weights/yolov5s-face.onnx"
    tf_save_path = "./yolov5-face"
    tflite_save_pth = "./yolov5-face/yolov5_face.tflite"
    tflite_with_meta = "/home/payphone/work/yolov5-face/yolov5-face/yolov5_face.tfliteint8_with_metadata.tflite"
    export_tflite_uint8(tf_save_path, tflite_save_pth)
    test_tflite_yolo(tflite_with_meta, fp32=False)
Operator Compatibility
tflite operator compatibility
- No five-dimensional channel permutation.
- When inference misbehaves, check the model against the official docs for unsupported operators.
tnn operator compatibility
- No five-dimensional channel permutation (newer versions seem to support it).
- When inference misbehaves, check the model against the official docs for unsupported operators.
- The community is active and there are many issues; search the issues first for likely answers.
Quantization reading
- https://blog.csdn.net/jin739738709/article/details/113244053
- https://blog.csdn.net/chen1234520nnn/article/details/118543638?utm_medium=distribute.pc_relevant.none-task-blog-2~default~baidujs_baidulandingword~default-1-118543638-blog-127716931.pc_relevant_aa&spm=1001.2101.3001.4242.2&utm_relevant_index=4
YOLO Model Conversion Pitfalls
As of now, tnn and tflite do not support five-dimensional channel permutation; it has to be rewritten as a four-dimensional permutation. But when you do the replacement, be sure to add contiguous()!! If the original paper's code adds it, you must add it too! It is usually there because the original values must not be updated in place: they may still be needed by other operations such as feature fusion. Reference code below.
def forward(self, x):
    z = []  # inference output
    if self.export_cat:
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            _, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            # 4-D permute instead of the original 5-D one; keep the contiguous()!
            yy = x[i].view(self.na, self.no, ny, nx).permute(0, 2, 3, 1).contiguous()
            self.grid[i], self.anchor_grid[i] = self._make_grid_new(nx, ny, i)
            yy = yy.sigmoid()
            box_xy, box_wh, conf, label = yy[..., :2], yy[..., 2:4], yy[..., 4:5], yy[..., 15:16]
            box_xy = (box_xy * 2. - 0.5 + self.grid[i][0].to(x[i].device)) * self.stride[i]  # xy
            box_wh = (box_wh * 2) ** 2 * self.anchor_grid[i][0]  # wh
            # normalize xywh into (0, 1) so everything shares one scale (input size 256)
            box_xy = box_xy / 256.
            box_wh = box_wh / 256.
            y = torch.cat((box_xy, box_wh, conf, label), 3)
            z.append(y.view(1, -1, 6))
        return torch.cat(z, 1)
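A standalone demonstration of why the contiguous() matters (shapes made up for illustration): view() requires contiguous memory, and permute() breaks that.

import torch

x = torch.randn(1, 255, 20, 20)
y = x.permute(0, 2, 3, 1)            # permute returns a non-contiguous view
try:
    y.view(1, -1, 255)               # view() demands contiguous memory
except RuntimeError as e:
    print("view failed:", e)
z = y.contiguous().view(1, -1, 255)  # copy into a contiguous layout first
print(z.shape)                       # torch.Size([1, 400, 255])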