YOLO V5 训练自己的数据集(全网最详细)
1.YOLO V5介绍
YOLOv5-6.0版本的网络可以按照深度和宽度分为五个版本:n、s、m、l和x。在大多数情况下,为了满足模型轻量化设计并保证检测精度,我们选择YOLOv5s作为基础模型进行改进。
YOLOv5主要由四个部分组成:输入端(Input)、主干网络(Backbone)、颈部网络(Neck)和检测端(Head)。这些部分协同工作,使得模型能够高效地进行目标检测。
主干网络是模型的核心部分,负责提取图像的特征信息。颈部网络则将主干网络提取的特征信息进行融合,为检测端提供更加丰富的信息。检测端则负责根据这些特征信息对目标进行定位和分类。
通过选用合适的版本和改进基础模型,YOLOv5可以为用户提供准确、快速的目标检测服务。
源代码:https://github.com/ultralytics/yolov5

其预训练权重可在官方下载

本项目使用YOLOv5s.pt
2.数据集介绍
WiderPerson数据集是一个针对拥挤场景行人检测的基准数据集,其图像来源不再仅限于交通场景,而是从多种场景中精心挑选而来。该数据集包含13382张图像,并附带了约40万个遮挡标记作为注释。为了确保公平性和有效性,我们随机选取了8000张、1000张和4382张图像分别作为训练集、验证集和测试集。与CityPersons和WIDER FACE数据集类似,我们不会发布测试图像的标注文件,以防止潜在的作弊行为。
您可以通过以下网址下载WiderPerson数据集:WiderPerson: A Diverse Dataset for Dense Pedestrian Detection in the Wild。
下载完成之后,其文件夹如下

其中Annotations文件下的txt文件如下,第一行数字为该图像的标注数目(没啥用处),之后每行开头的数字为类别标签,本数据集内共有五个类别
1 : pedestrians
2 : riders
3 : partially-visible persons
4 : ignore regions
5 : crowd

为把它转成VOC格式文件,需要把这个txt文件转换成xml文件,代码如下
import os
import numpy as np
import scipy.io as sio
import shutil
from lxml.etree import Element, SubElement, tostring
from xml.dom.minidom import parseString
import cv2
def make_voc_dir():
    """Create the VOC2007 directory skeleton under the parent directory.

    Builds ../VOC2007/{Annotations, ImageSets/Main, JPEGImages}.
    Uses ``exist_ok=True`` so a partially created tree is completed:
    the original version skipped creating ``ImageSets/Main`` whenever
    ``ImageSets`` itself already existed.
    """
    os.makedirs('../VOC2007/Annotations', exist_ok=True)
    # makedirs creates intermediate dirs, so ImageSets is implied by Main.
    os.makedirs('../VOC2007/ImageSets/Main', exist_ok=True)
    os.makedirs('../VOC2007/JPEGImages', exist_ok=True)
if __name__ == '__main__':
    # WiderPerson annotation class ids (1-based) -> VOC class names.
    # NOTE: these short names ('partially', 'ignore') are what gets written
    # into the xml <name> fields; downstream class lists must match them.
    classes = {'1': 'pedestrians',
               '2': 'riders',
               '3': 'partially',
               '4': 'ignore',
               '5': 'crowd'}
    VOCRoot = '../VOC2007'
    wider_path = './WiderPerson/val.txt'  # list of image ids to convert
    make_voc_dir()

    with open(wider_path, 'r') as f:
        imgIds = f.read().splitlines()

    for imgId in imgIds:
        filename = imgId + '.jpg'
        img_path = './WiderPerson/Images/' + filename
        print('Img :%s' % img_path)
        img = cv2.imread(img_path)
        if img is None:
            # Unreadable/missing image: skip instead of crashing on img.shape.
            print('Skip unreadable image: %s' % img_path)
            continue
        height, width = img.shape[:2]  # image size in pixels

        # Build the VOC <annotation> tree for this image.
        node_root = Element('annotation')
        node_folder = SubElement(node_root, 'folder')
        node_folder.text = 'JPEGImages'
        node_filename = SubElement(node_root, 'filename')
        node_filename.text = 'VOC2007/JPEGImages/%s' % filename
        node_size = SubElement(node_root, 'size')
        node_width = SubElement(node_size, 'width')
        node_width.text = '%s' % width
        node_height = SubElement(node_size, 'height')
        node_height.text = '%s' % height
        node_depth = SubElement(node_size, 'depth')
        node_depth.text = '3'

        # Matching annotation file: first line is the object count
        # (redundant), each following line is
        # "<class_id> <xmin> <ymin> <xmax> <ymax>".
        label_path = img_path.replace('Images', 'Annotations') + '.txt'
        with open(label_path) as file:
            label_lines = file.read().splitlines()
        for line in label_lines[1:]:
            if not line.strip():
                continue  # tolerate a trailing blank line
            parts = line.split(' ')
            cls_id = parts[0]
            # VOC uses 1-based pixel coordinates, hence the +1 shift.
            xmin = int(parts[1]) + 1
            ymin = int(parts[2]) + 1
            xmax = int(parts[3]) + 1
            ymax = int(parts[4]) + 1
            cls_name = classes[cls_id]
            # Tiny boxes are flagged 'difficult' so evaluation can skip them.
            difficult = 1 if (ymax - ymin) <= 6 or (xmax - xmin) <= 6 else 0

            node_object = SubElement(node_root, 'object')
            node_name = SubElement(node_object, 'name')
            node_name.text = cls_name
            node_difficult = SubElement(node_object, 'difficult')
            node_difficult.text = '%s' % difficult
            node_bndbox = SubElement(node_object, 'bndbox')
            node_xmin = SubElement(node_bndbox, 'xmin')
            node_xmin.text = '%s' % xmin
            node_ymin = SubElement(node_bndbox, 'ymin')
            node_ymin.text = '%s' % ymin
            node_xmax = SubElement(node_bndbox, 'xmax')
            node_xmax.text = '%s' % xmax
            node_ymax = SubElement(node_bndbox, 'ymax')
            node_ymax.text = '%s' % ymax
            node_pose = SubElement(node_object, 'pose')
            node_pose.text = 'Unspecified'
            node_truncated = SubElement(node_object, 'truncated')
            node_truncated.text = '0'

        # Serialize the annotation and copy the image into the VOC tree.
        xml = tostring(node_root, pretty_print=True)
        xml_name = filename.replace('.jpg', '.xml')
        xml_path = VOCRoot + '/Annotations/' + xml_name
        with open(xml_path, 'wb') as f:
            f.write(xml)
        shutil.copy(img_path, VOCRoot + '/JPEGImages/' + filename)
可以用以下代码展示一下数据集
# -*- coding: utf-8 -*-
import os
import cv2

if __name__ == '__main__':
    # Visualize WiderPerson boxes: draw pedestrians (label 1) and
    # partially-visible persons (label 3) on each training image.
    path = './WiderPerson/train.txt'
    with open(path, 'r') as f:
        img_ids = f.read().splitlines()
    for img_id in img_ids:  # e.g. '000040'
        img_path = './WiderPerson/JPEGImages/' + img_id + '.jpg'
        print(img_path)
        img = cv2.imread(img_path)
        if img is None:
            # Unreadable/missing image: skip instead of crashing on img.shape.
            print('Skip unreadable image: %s' % img_path)
            continue
        label_path = img_path.replace('JPEGImages', 'Annotations') + '.txt'
        print(label_path)
        with open(label_path) as file:
            label_lines = file.read().splitlines()
        # First line: object count (unused). Each following line:
        # "<class_label> <xmin> <ymin> <xmax> <ymax>" where
        #   1: pedestrians, 2: riders, 3: partially-visible persons,
        #   4: ignore regions (fake persons, e.g. posters), 5: crowd.
        for line in label_lines[1:]:
            if not line.strip():
                continue
            parts = line.split(' ')
            cls = int(parts[0])
            print(cls)
            if cls == 1 or cls == 3:
                xmin = float(parts[1])
                ymin = float(parts[2])
                xmax = float(parts[3])
                ymax = float(parts[4])
                img = cv2.rectangle(img, (int(xmin), int(ymin)),
                                    (int(xmax), int(ymax)), (0, 255, 0), 2)
        cv2.imshow('result', img)
        cv2.waitKey(0)
3.数据集处理
用上述代码可以生成以下文件夹

下面划分数据集和验证集,用split_train_val.py
# coding:utf-8
"""Split VOC xml annotations into trainval/train/val/test id lists."""
import os
import random
import argparse

parser = argparse.ArgumentParser()
# Directory containing the xml annotations (usually .../Annotations).
parser.add_argument('--xml_path', default='./VOC2007/Annotations', type=str, help='input xml label path')
# Output directory for the split lists (usually .../ImageSets/Main).
parser.add_argument('--txt_path', default='./VOC2007/ImageSets/Main', type=str, help='output txt label path')
opt = parser.parse_args()

# Fraction of all images used for trainval (the rest becomes test), and the
# fraction of trainval that becomes train (the rest becomes val).
trainval_percent = 1
train_percent = 0.9

xmlfilepath = opt.xml_path
txtsavepath = opt.txt_path
print(xmlfilepath)
total_xml = os.listdir(xmlfilepath)
os.makedirs(txtsavepath, exist_ok=True)

num = len(total_xml)
tv = int(num * trainval_percent)
tr = int(tv * train_percent)
# Sets give O(1) membership tests in the loop below; the original used
# lists, making the split O(n^2). sorted() keeps random.sample happy
# (sampling from a set is removed in Python 3.11).
trainval = set(random.sample(range(num), tv))
train = set(random.sample(sorted(trainval), tr))

# Context managers guarantee the four list files are flushed and closed.
with open(txtsavepath + '/trainval.txt', 'w') as file_trainval, \
        open(txtsavepath + '/test.txt', 'w') as file_test, \
        open(txtsavepath + '/train.txt', 'w') as file_train, \
        open(txtsavepath + '/val.txt', 'w') as file_val:
    for i in range(num):
        name = total_xml[i][:-4] + '\n'  # strip the '.xml' suffix
        if i in trainval:
            file_trainval.write(name)
            if i in train:
                file_train.write(name)
            else:
                file_val.write(name)
        else:
            file_test.write(name)
生成的txt文件如下

再进一步,使用voc_labels.py,将其中的classes列表修改成自己的类别
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
import os
from os import getcwd

sets = ['train', 'val', 'test']
# Change to your own classes. These names must match the <name> values the
# conversion script wrote into the xml files ('partially' / 'ignore', not
# the long forms) — otherwise those objects are silently skipped by the
# `cls not in classes` check below.
classes = ["pedestrians", "riders", "partially", "ignore", "crowd"]
abs_path = os.getcwd()
print(abs_path)
def convert(size, box):
    """Convert a VOC box to normalized YOLO coordinates.

    Args:
        size: (image_width, image_height) in pixels.
        box: (xmin, xmax, ymin, ymax) in VOC pixel coordinates.

    Returns:
        (x_center, y_center, width, height), each normalized to [0, 1].
        The -1 shift on the center follows the darknet voc_label
        convention (VOC pixel coordinates are 1-based).
    """
    scale_x = 1. / (size[0])
    scale_y = 1. / (size[1])
    xmin, xmax, ymin, ymax = box
    center_x = ((xmin + xmax) / 2.0 - 1) * scale_x
    center_y = ((ymin + ymax) / 2.0 - 1) * scale_y
    box_w = (xmax - xmin) * scale_x
    box_h = (ymax - ymin) * scale_y
    return center_x, center_y, box_w, box_h
def convert_annotation(image_id):
    """Convert one VOC xml annotation into a YOLO txt label file.

    Reads D:/V5/VOC2007/Annotations/<image_id>.xml and writes
    D:/V5/VOC2007/labels/<image_id>.txt, one
    "<class_id> <cx> <cy> <w> <h>" line per kept object (normalized
    via `convert`). Objects whose class is not in the module-level
    `classes` list, or which are marked difficult, are skipped.
    """
    in_path = 'D:/V5/VOC2007/Annotations/%s.xml' % image_id
    out_path = 'D:/V5/VOC2007/labels/%s.txt' % image_id
    # Context managers close both files even on error; the original
    # leaked both handles.
    with open(in_path, encoding='UTF-8') as in_file, \
            open(out_path, 'w') as out_file:
        tree = ET.parse(in_file)
        root = tree.getroot()
        size = root.find('size')
        w = int(size.find('width').text)
        h = int(size.find('height').text)
        for obj in root.iter('object'):
            difficult = obj.find('difficult').text
            cls = obj.find('name').text
            if cls not in classes or int(difficult) == 1:
                continue
            cls_id = classes.index(cls)
            xmlbox = obj.find('bndbox')
            b1 = float(xmlbox.find('xmin').text)
            b2 = float(xmlbox.find('xmax').text)
            b3 = float(xmlbox.find('ymin').text)
            b4 = float(xmlbox.find('ymax').text)
            # Clamp boxes that run past the image border.
            b2 = min(b2, w)
            b4 = min(b4, h)
            bb = convert((w, h), (b1, b2, b3, b4))
            out_file.write(str(cls_id) + " " + " ".join(str(a) for a in bb) + '\n')
wd = getcwd()
# Create the YOLO labels directory once, up front (the original
# re-checked it on every iteration of the loop).
os.makedirs('D:/V5/VOC2007/labels/', exist_ok=True)
for image_set in sets:
    ids_path = 'D:/V5/VOC2007/ImageSets/Main/%s.txt' % image_set
    # Context managers close both files; the original never closed
    # the ids file handle.
    with open(ids_path) as ids_file:
        image_ids = ids_file.read().strip().split()
    # One absolute image path per line, plus a YOLO label file per image.
    with open('D:/V5/VOC2007/%s.txt' % image_set, 'w') as list_file:
        for image_id in image_ids:
            list_file.write('D:/V5/VOC2007/JPEGImages/%s.jpg\n' % image_id)
            convert_annotation(image_id)
四.训练过程
找到data文件夹的xView.yaml文件,复制一份,改成data.yaml文件,里面放自己的类别
修改前

修改后

找到yolov5s.yaml,复制一份改成yolov5s_s.yaml修改其中的nc参数

修改train.py中的参数,weights改成下载的预训练权重,cfg放yolov5s_s.yaml,data放 data.yaml,修改合理的epoch和batch_size,看着自己的显卡来

运行train.py报错,找网上教程自己修改
五.结果显示
训练完成之后,运行detect.py文件,修改参数,weights的权重在runs下面,source更改可以实现图片、摄像头、视频的检测,别的参数看着修改,一般不修改。

运行结果如下

欢迎交流评论,有啥问题评论区交流