
Accelerating Model Inference

ONNX

Export a PyTorch model to ONNX and run inference with it

import torch
from models import get_model_resnet
 
def model_converter():
    weight = torch.load('xxx/best34.pt')
    model = get_model_resnet(net_name="resnet34")
    model.load_state_dict(weight)
    model.eval()
    # Static input: the input shape is fixed. The shape depends on the task; this is a grading (classification) task, so I fix the shape here.
    dummy_input = torch.randn(1, 3, 720, 720, device='cpu')
    input_names = ['data']
    output_names = ['output']
    torch.onnx.export(model, dummy_input, 'resnet34.onnx', 
                      export_params=True, 
                      verbose=True, 
                      input_names=input_names, 
                      output_names=output_names)

model_converter()

Load the exported model with ONNX Runtime for inference

import time

import onnxruntime  # CPU build used here
import cv2
import numpy as np

def softmax(x):
    x -= np.max(x, axis= 1, keepdims=True)
    f_x = np.exp(x) / np.sum(np.exp(x), axis=1, keepdims=True)
    return f_x

def predict():
    for i in range(0,200):
        image = cv2.imread('xxx/163.png')
        image = cv2.resize(image, (720, 720))  # must match the input shape of the exported ONNX model
        height, width = image.shape[:2]  # H, W, C
        image = image / np.max(image)
        tmpImg = np.zeros((height, width, 3))
        start = time.time()
        # BGR -> RGB plus ImageNet normalization, pairing each channel with its own mean/std
        # (R: 0.485/0.229, G: 0.456/0.224, B: 0.406/0.225)
        tmpImg[:, :, 0] = (image[:, :, 2] - 0.485) / 0.229
        tmpImg[:, :, 1] = (image[:, :, 1] - 0.456) / 0.224
        tmpImg[:, :, 2] = (image[:, :, 0] - 0.406) / 0.225

        tmpImg = tmpImg.transpose((2, 0, 1)).astype(np.float32)  # HWC -> CHW
        tmpImg = tmpImg[np.newaxis, :, :, :]  # CHW -> NCHW

        # Inference. Note: ideally create the InferenceSession once outside the loop;
        # rebuilding it every iteration adds overhead to the measured time.
        # providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
        providers = ['CPUExecutionProvider']

        onnx_session = onnxruntime.InferenceSession('/xxx/resnet34.onnx', providers=providers)
        onnx_input = {'data': tmpImg}
        onnx_output = onnx_session.run(['output'], onnx_input)[0]
        prob = softmax(onnx_output)  # softmax turns the logits into confidence values
        print(time.time() - start)  # inference time

# optimized from 0.67~0.61 down to 0.42~0.31
predict()

To run ONNX inference on the GPU, install the GPU build: pip install onnxruntime-gpu.
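A small sketch for checking that the GPU build is actually in use (the model path is a placeholder):

import onnxruntime

# requires the onnxruntime-gpu package; CUDAExecutionProvider silently falls back
# to CPU if it cannot be loaded, so it is worth checking the active providers
session = onnxruntime.InferenceSession(
    'resnet34.onnx',
    providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
print(session.get_providers())  # CUDAExecutionProvider should be listed first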

TensorRT

Just use the library directly: https://github.com/NVIDIA-AI-IOT/torch2trt

import torchvision
import torch
import time
from torch2trt import torch2trt

model = torchvision.models.resnet34(pretrained=False).cuda().half().eval()
data = torch.randn((1, 3, 224, 224)).cuda().half()
print("start trt")
tmp = time.time()
model_trt = torch2trt(model, [data], fp16_mode=True)
print(f"finish trt:{time.time()-tmp}")

trt_start = time.time()
output_trt = model_trt(data) # tensorrt 0.001
print(time.time()-trt_start)

model_start = time.time()
output = model(data)
print(time.time()-model_start) # pytorch 0.005

torch.save(model_trt.state_dict(), 'resnet34_trt.pth')
print(output.flatten()[0:10])
print(output_trt.flatten()[0:10])
print('max error: %f' % float(torch.max(torch.abs(output - output_trt))))

NumPy acceleration

pip install numba

Decorating a function with @numba.njit JIT-compiles it and can speed up the NumPy code inside it.
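A minimal sketch of what that looks like (row_softmax is just an illustrative function, not something from this post):

import time
import numpy as np
import numba

@numba.njit
def row_softmax(x):
    # explicit Python loops over NumPy arrays are where njit helps the most
    out = np.empty_like(x)
    for i in range(x.shape[0]):
        row = x[i] - np.max(x[i])
        e = np.exp(row)
        out[i] = e / np.sum(e)
    return out

x = np.random.rand(2000, 512)
row_softmax(x)                 # the first call JIT-compiles the function (one-time cost)
t0 = time.time()
row_softmax(x)                 # later calls run the compiled machine code
print(time.time() - t0)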

TNN

Model deployment on mobile. Most operators are supported, but 5-dimensional cat, permute, and similar ops are not supported yet.

ONNX2TNN

Converting from ONNX to TNN.

  1. Create a conda environment and install the packages below. Note that protobuf must be pinned to 3.20.x or lower. If any other package complains that a version is too high or too low, just install the version it asks for.

     pip3 install onnx==1.6.0 onnxruntime numpy onnx-simplifier protobuf==3.20.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
    
  2. On Ubuntu, clone the TNN repository; the command below uses the gitee mirror.

     git clone https://gitee.com/atari/TNN.git
    
  3. Go into the TNN/tools/onnx2tnn/onnx-converter directory and build:

     cd <path-to-tnn>/tools/onnx2tnn/onnx-converter
     ./build.sh 
    
  4. Use onnx2tnn.py in the onnx-converter directory to convert the model:

     # show the help message
     python onnx2tnn.py -h 
     # e.g. convert yolov5s-face.onnx to TNN with input shape 1,3,640,640 and optimization enabled
     python onnx2tnn.py yolov5s-face.onnx -version=v1.0 -optimize=1 -o ./ -input_shape input:1,3,640,640
    

  5. After conversion, drag the generated model and param files into Netron to check that the graph is correct.

Note: slice ops should appear as StridedSliceV2 in the visualized graph. A model I converted earlier showed StridedSlice instead, and it kept throwing errors at runtime.

Building a TNN project

(TBD)

TNN debugging

ONNX

onnx2trt

Converting from ONNX to TRT with my own code was too slow, so here I use the official tool instead.

git clone --recursive -b 8.2-EA https://github.com/onnx/onnx-tensorrt.git

cd onnx-tensorrt
mkdir build
cd build
# change /path/to/TensorRT-8.2.4.2 to the absolute path of your own TensorRT installation
cmake .. -DTENSORRT_ROOT=/path/to/TensorRT-8.2.4.2
make -j8
make install
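
After make install, the onnx2trt executable can build an engine directly; a typical invocation (file names are placeholders) looks like this:

# convert an ONNX model into a serialized TensorRT engine
onnx2trt my_model.onnx -o my_engine.trt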

torch2onnx

Export the PyTorch model to ONNX

import argparse
import sys
import time
import os

import numpy as np
import cv2

sys.path.append('./')  

import torch
import torch.nn as nn

import models
from models.experimental import attempt_load
from utils.activations import Hardswish, SiLU
import onnx


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str, default='/home/payphone/work/yolov5-face/weights/yolov5s-face.pt', help='weights path')  # from yolov5/models/
    parser.add_argument('--batch_size', type=int, default=1, help='batch size')
    parser.add_argument('--dynamic', action='store_true', default=False, help='enable dynamic axis in onnx model')
    parser.add_argument('--onnx_infer', action='store_true', default=True, help='onnx infer test')

    opt = parser.parse_args()

    # Load PyTorch model
    model = attempt_load(opt.weights, map_location=torch.device('cpu'))  # load FP32 model
    delattr(model.model[-1], 'anchor_grid')  # the yolov5 export script always does this before tracing
    model.model[-1].anchor_grid=[torch.zeros(1)] * 3 # nl=3 number of detection layers
    model.model[-1].export_cat = True
    model.eval()
    labels = model.names

   
    #====================================
    img0 = cv2.imread("/home/payphone/work/yolov5-face/data/images/bus.jpg")
    H,W,C = img0.shape
    origin = np.zeros(shape=(np.max(img0.shape),np.max(img0.shape),3))
    origin[0:H,0:W,] = img0

    origin = cv2.resize(origin,dsize=(640,640))
    padding_img = origin.transpose(2, 0, 1).copy()
    padding_img = torch.from_numpy(padding_img).float() / 255.0
    padding_img = padding_img.unsqueeze(0)
    #====================================

    # Update model
    # some model-update steps are omitted here

    y = model(padding_img)  # dry run using the preprocessed image as the dummy input
    model.fuse()  # only for ONNX
    
    f = "export_model.onnx"
    
    input_names=['input']
    output_names=['output']
    torch.onnx.export(model, padding_img, f, verbose=False, opset_version=12, 
        input_names=input_names,
        output_names=output_names,
        dynamic_axes = {'input': {0: 'batch'},
                        'output': {0: 'batch'}
                        } if opt.dynamic else None)

    onnx_model = onnx.load(f)  
    onnx.checker.check_model(onnx_model)  # check onnx model
    #===============================================================================
    #                             Simplify the ONNX model                           #
    #===============================================================================
    from onnxsim import simplify
    onnx_model, check = simplify(onnx_model)
    assert check, "Simplified ONNX model could not be validated"
    onnx.save(onnx_model, f)


    # # onnx infer
    if opt.onnx_infer:
        import onnxruntime
        import numpy as np
        providers =  ['CPUExecutionProvider']
        session = onnxruntime.InferenceSession(f, providers=providers)
        padding_img = padding_img.cpu().numpy().astype(np.float32) # torch to numpy
        # [session.get_outputs()[0].name] lists the output names; there can be one or several
        # depending on the model, and they must match the output_names used at export time
        y_onnx = session.run([session.get_outputs()[0].name], {session.get_inputs()[0].name: padding_img})[0]

Changing the ONNX batch size

Some ONNX files are exported with a dynamic batch size, which can cause problems when converting to TRT; you can rewrite the batch size in the ONNX model yourself. If some branch keeps the old batch size after the rewrite, check whether the traversal skipped it (for example because it uses a different key name).

import onnx

def change_input_dim(model):
    batch_size = 1  # can also be a symbolic name such as "N" for a dynamic batch
    inputs = model.graph.input
    for input in inputs:
        dim1 = input.type.tensor_type.shape.dim[0]
        if isinstance(batch_size, int):
            dim1.dim_value = batch_size
        elif isinstance(batch_size, str) and batch_size.isdigit():
            dim1.dim_value = int(batch_size)
        elif isinstance(batch_size, str):
            dim1.dim_param = batch_size  # symbolic (dynamic) batch dimension
        else:
            dim1.dim_value = 1

def apply(transform, infile, outfile):
    model = onnx.load(infile)
    transform(model)
    onnx.save(model, outfile)
    
if __name__=='__main__':
    source = '/xxx/face_mesh_Nx3x192x192_post.onnx'
    target = "./face_mesh_1x3x192x192_post.onnx"
    apply(change_input_dim, source, target)

Removing ONNX nodes/branches

Reference: "ONNX删除节点示例 (Deeplabv3plus)", DazeAJD's blog on CSDN.
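
As a rough sketch of the idea (not the linked post's exact code): load the graph, rewire the consumers of the node you want to drop, then remove it. The node name "Identity_0" below is hypothetical.

import onnx

model = onnx.load("model.onnx")
graph = model.graph

# pick the node to delete (assumes a single-input, single-output node)
target = next(n for n in graph.node if n.name == "Identity_0")
src, dst = target.input[0], target.output[0]

# every node that consumed the removed node's output now reads its input instead
for node in graph.node:
    for i, name in enumerate(node.input):
        if name == dst:
            node.input[i] = src

graph.node.remove(target)
onnx.checker.check_model(model)
onnx.save(model, "model_pruned.onnx")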

TensorRT

TensorRT model conversion

You can convert with the official trtexec tool (a minimal example is shown below), or write the conversion yourself: export the model to ONNX first, then convert it to TRT. The Python code that follows targets TensorRT 8.x.
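
A minimal trtexec sketch (the ONNX and engine file names are placeholders):

# build an FP16 engine from an ONNX model with the bundled trtexec tool
trtexec --onnx=yolov5s-face.onnx --saveEngine=yolov5s-face.trt --fp16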

import pycuda.autoinit
import pycuda.driver as cuda

# conversion to TensorRT succeeded with TensorRT 8.x and CUDA 11.3 (cuDNN version unknown)
import tensorrt as trt
import numpy as np

from utils.general import non_max_suppression_face

EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
def GiB(val):
    return val * 1 << 30

def ONNX_to_TRT(onnx_model_path=None,trt_engine_path=None,fp16_mode=False):
    """
    仅适用TensorRT V8版本
    生成cudaEngine,并保存引擎文件(仅支持固定输入尺度)  
    
    fp16_mode: True则fp16预测
    onnx_model_path: 将加载的onnx权重路径
    trt_engine_path: trt引擎文件保存路径
    """
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(EXPLICIT_BATCH)
    parser = trt.OnnxParser(network, TRT_LOGGER)
    
    config = builder.create_builder_config()
    config.max_workspace_size=GiB(1) 
    if fp16_mode:
        config.set_flag(trt.BuilderFlag.FP16) 

    with open(onnx_model_path, 'rb') as model:
        assert parser.parse(model.read())
        serialized_engine=builder.build_serialized_network(network, config)

    with open(trt_engine_path, 'wb') as f:
        f.write(serialized_engine)  # write the serialized engine to disk

    print('TensorRT file in ' + trt_engine_path)
    print('============ONNX->TensorRT SUCCESS============')

class TrtModel():
    '''
    TensorRT infer
    '''
    def __init__(self, trt_path):
        self.ctx=cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(trt_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('binding:', binding, engine.get_binding_shape(binding))
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size
    
    def __call__(self,img_np_nchw):
        '''
        Run TensorRT inference.
        :param img_np_nchw: input image in NCHW layout
        '''
        self.ctx.push()

        # Restore
        stream = self.stream
        context = self.context
        engine = self.engine
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # img_np_nchw has shape (1, 3, 640, 640); ravel() flattens it to (1228800,)
        np.copyto(host_inputs[0], img_np_nchw.ravel())

        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # note: for explicit-batch engines, context.execute_async_v2 is the preferred API
        context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        stream.synchronize()
        self.ctx.pop()
        
        return host_outputs

    def destroy(self):
        self.ctx.pop()


if __name__ =="__main__":
    import cv2
    import torch

    path = "/home/payphone/work/yolov5-face/data/images/zidane.jpg"
    img = cv2.imread(path)
    img = cv2.resize(img,dsize=(640,640))/255
    img = np.transpose(img,(2,0,1))
    img = np.expand_dims(img,0)
    trt_model = TrtModel("/home/payphone/work/yolov5-face/weights/yolov5s-face-fp32.trt")  # distinct name so the tensorrt module (trt) is not shadowed

    ans = trt_model(img)
    ans = np.reshape(ans, (1, -1, 16))
    ans = torch.tensor(ans).cuda()
    pred = non_max_suppression_face(ans, 0.1, 0.1)

    origin = cv2.imread(path)
    origin = cv2.resize(origin,dsize=(640,640))
    for result in pred[0]:
        x1 = int(result[0])
        y1 = int(result[1])
        x2 = int(result[2])
        y2 = int(result[3])
        d = cv2.rectangle(origin,(x1,y1),(x2,y2),(0,255,0),thickness=2)
    cv2.imwrite("dd.jpg",d)
    trt_model.ctx.pop()

TFLite

Issues with quantized (fixed-point) TFLite models

Accuracy drop

Abnormal outputs

When converting a YOLO model, the score values collapsed: they came out either all 0 or all greater than 1.

The output of a YOLO model should be (xywh, score, classes). Here xywh values are greater than 1 (pixel units), while the scores came out either as 0 or all above 1 (e.g. all 14.5). The cause is missing normalization: scale score and xywh into the same range, e.g. (0, 1), and the outputs become sane again (see the forward() at the end of this post, where box_xy and box_wh are divided by the input size).

TFLite notes

Converting to TFLite

Library for converting TFLite models to ONNX: onnx/tensorflow-onnx: Convert TensorFlow, Keras, Tensorflow.js and Tflite models to ONNX (github.com)
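
A typical command for that converter (file names are placeholders), going from a TFLite file back to ONNX:

python -m tf2onnx.convert --tflite model.tflite --output model.onnx --opset 13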

torch ==> tflite

onnx ==> tflite

from onnx_tf.backend import prepare
import onnx
import tensorflow as tf
import torch
import numpy as np
import cv2
import sys
from PIL import Image

sys.path.append('./')  

from utils.general import  non_max_suppression_face

def export_tf(onnx_path="weights/yolov5s-face.onnx", tf_save_path="./yolov5s-face.tf"):
    onnx_model = onnx.load(onnx_path)
    tf_rep = prepare(onnx_model)
    tf_rep.export_graph(tf_save_path)


def export_tflite(tf_save_path="./yolov5s-face.tf",tflite_save_pth="./model.tflite"):
    converter = tf.lite.TFLiteConverter.from_saved_model(tf_save_path)
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS,
        tf.lite.OpsSet.SELECT_TF_OPS,
    ]
    converter.target_spec.supported_types = [tf.float32]

    converter.inference_input_type =  tf.float32 #tf.uint8  # or tf.int8
    converter.inference_output_type = tf.float32 #tf.uint8  # or tf.int8

    tflite_model = converter.convert()
    with open(tflite_save_pth,'wb') as f:
        f.write(tflite_model)

def representative_dataset_gen():
    import os
    path = "/home/payphone/work/datasets/"
    names = os.listdir(path)
    files = [os.path.join(path,item) for item in names]
    abs_paths = []
    for file in files:
        abs_paths += [os.path.join(file,item) for item in os.listdir(file)]
    
    samples = []  # subsample the calibration set; could go up to len(abs_paths)-1
    for item in range(0, 50, 2):
        samples.append(abs_paths[item])
    for item in samples:
        try:
            test_image = cv2.cvtColor(cv2.imread(item), cv2.COLOR_BGR2RGB).astype(np.float32)

            input_data = cv2.resize(test_image,dsize=(256,256))/255
            yield [np.expand_dims(np.transpose(input_data.astype(np.float32),[2,0,1]),axis=0)]
        except Exception:
            # skip images that fail to load instead of yielding a bad sample
            print(item)

def export_tflite_uint8(tf_save_path="./yolov5s-face.tf",tflite_save_pth="./model.tflite"):
    converter = tf.lite.TFLiteConverter.from_saved_model(tf_save_path)

    converter.representative_dataset = lambda: representative_dataset_gen()
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]  # tf.lite.OpsSet.SELECT_TF_OPS
    converter.target_spec.supported_types = []
    converter.inference_input_type = tf.uint8  # or tf.int8
    converter.inference_output_type = tf.uint8 # or tf.int8
    converter.experimental_new_quantizer = False    # quantization fails when this is enabled
    converter.experimental_new_converter = True
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.target_spec.supported_ops.append(tf.lite.OpsSet.SELECT_TF_OPS)
    tflite_model = converter.convert()
    with open(tflite_save_pth+str("int8_with_metadata.tflite"),'wb') as f:
        f.write(tflite_model)
    
    import contextlib
    from pathlib import Path
    with contextlib.suppress(ImportError):
        # check_requirements('tflite_support')
        from tflite_support import flatbuffers
        from tflite_support import metadata as _metadata
        from tflite_support import metadata_schema_py_generated as _metadata_fb
        metadata = "{'stride': 32, 'names': {0: 'person'}}"
        tmp_file = Path('/tmp/meta.txt')
        with open(tmp_file, 'w') as meta_f:
            meta_f.write(str(metadata))

        file = "/home/payphone/work/yolov5-face/yolov5-face/yolov5_face.tfliteint8_with_metadata.tflite"
        model_meta = _metadata_fb.ModelMetadataT()
        label_file = _metadata_fb.AssociatedFileT()
        label_file.name = tmp_file.name
        model_meta.associatedFiles = [label_file]

        subgraph = _metadata_fb.SubGraphMetadataT()
        subgraph.inputTensorMetadata = [_metadata_fb.TensorMetadataT()]
        subgraph.outputTensorMetadata = [_metadata_fb.TensorMetadataT()] * 1
        model_meta.subgraphMetadata = [subgraph]

        b = flatbuffers.Builder(0)
        b.Finish(model_meta.Pack(b), _metadata.MetadataPopulator.METADATA_FILE_IDENTIFIER)
        metadata_buf = b.Output()

        populator = _metadata.MetadataPopulator.with_model_file(file)
        populator.load_metadata_buffer(metadata_buf)
        populator.load_associated_files([str(tmp_file)])
        populator.populate()
        tmp_file.unlink()


def test_tflite_yolo(tflite_save_pth="./model.tflite", image_path = "torch2trt_onnx/zidane.jpg", fp32=True):
    interpreter = tf.lite.Interpreter(model_path=tflite_save_pth)
    interpreter.allocate_tensors()

    #==============================================================
    input_details = interpreter.get_input_details()[0]
    output_details = interpreter.get_output_details()[0]
    #==============================================================

    test_image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)

    H,W,C = test_image.shape
    input_data = np.zeros(shape=(np.max(test_image.shape),np.max(test_image.shape),3))
    input_data[0:H,0:W,] = test_image
    input_data = cv2.resize(input_data,dsize=(256,256))

    if fp32:
        input_data = input_data/255
        print("fp32~~~")
    else:
        print("uint8~~~")
        input_data = input_data.astype(np.uint8)

    input_data = np.transpose(input_data, [2,0,1])

    if input_details['dtype'] == np.uint8:
        input_scale, input_zero_point = input_details["quantization"]
        input_data = input_data / input_scale + input_zero_point

    input_data = np.expand_dims(input_data, axis=0).astype(input_details["dtype"])
    interpreter.set_tensor(input_details["index"], input_data)
    interpreter.invoke()
    output = interpreter.get_tensor(output_details["index"])

    if output_details['dtype'] == np.uint8:
        output_scale, output_zero_point = output_details["quantization"]
        output = (output- output_zero_point)*output_scale

    output = non_max_suppression_face(torch.tensor(output), -1, 0.1)

    print(output[0].shape)

    test_image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)

    H,W,C = test_image.shape
    input_data = np.zeros(shape=(np.max(test_image.shape),np.max(test_image.shape),3))
    input_data[0:H,0:W,] = test_image
    input_data = cv2.resize(input_data,dsize=(256,256))

    for item in output[0]:
        cv2.rectangle(input_data,  (int(item[0]),int(item[1])),  (int(item[2]),int(item[3])), (0,255,0))

    if fp32:
        cv2.imwrite("yolo_vis_fp32.jpg",input_data)
    else:
        cv2.imwrite("yolo_vis_uint8.jpg",input_data)


def execute(onnx_path,tf_save_path,tflite_save_pth):
    export_tf(onnx_path,tf_save_path)
    export_tflite(tf_save_path,tflite_save_pth)
    export_tflite_uint8(tf_save_path,tflite_save_pth)



if __name__=="__main__":

    onnx_path = "/home/payphone/work/yolov5-face/weights/yolov5s-face.onnx"
    tf_save_path = "./yolov5-face"
    tflite_save_pth = "./yolov5-face/yolov5_face.tflite"

    tflite_with_meta = "/home/payphone/work/yolov5-face/yolov5-face/yolov5_face.tfliteint8_with_metadata.tflite"

    export_tflite_uint8(tf_save_path,tflite_save_pth)
    test_tflite_yolo(tflite_with_meta,fp32=False)

Operator support issues

TFLite operator support

TNN operator support

Quantization notes

Pitfalls when converting YOLO models

As far as I know, TNN and TFLite do not support permuting across five dimensions. You have to rewrite it as a four-dimensional permute, but when you do, be sure to add contiguous()! If the original paper's code has it, you need it too: it usually means the original values must not be modified in place, because they may still be needed for other operations such as feature fusion. A reference implementation is below.

def forward(self, x):
    z = []  # inference output
    if self.export_cat:
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            _,_, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            # the 4-D view + permute works here; keep the contiguous() call
            yy = x[i].view(self.na, self.no, ny, nx).permute(0, 2, 3, 1).contiguous()

            self.grid[i], self.anchor_grid[i] = self._make_grid_new(nx, ny,i)
            yy = yy.sigmoid()
            box_xy,box_wh,conf,label = yy[...,:2],yy[...,2:4],yy[...,4:5],yy[...,15:16]

            box_xy = (box_xy * 2. - 0.5 + self.grid[i][0].to(x[i].device)) * self.stride[i] # xy
            box_wh = (box_wh * 2) ** 2 * self.anchor_grid[i][0] # wh
            box_xy = box_xy / 256.
            box_wh = box_wh / 256.

            y = torch.cat((box_xy,box_wh,conf,label),3)
            z.append(y.view(1, -1, 6))
        return torch.cat(z, 1)