YOLO V5 训练自己的数据集(全网最详细)
1.YOLO V5介绍
YOLOv5-6.0版本的网络可以按照深度和宽度分为五个版本:n、s、m、l和x。在大多数情况下,为了满足模型轻量化设计并保证检测精度,我们选择YOLOv5s作为基础模型进行改进。
YOLOv5主要由四个部分组成:输入端(Input)、主干网络(Backbone)、颈部网络(Neck)和检测端(Head)。这些部分协同工作,使得模型能够高效地进行目标检测。
主干网络是模型的核心部分,负责提取图像的特征信息。颈部网络则将主干网络提取的特征信息进行融合,为检测端提供更加丰富的信息。检测端则负责根据这些特征信息对目标进行定位和分类。
通过选用合适的版本和改进基础模型,YOLOv5可以为用户提供准确、快速的目标检测服务。
源代码:https://github.com/ultralytics/yolov5

其预训练权重可在官方下载

本项目使用YOLOv5s.pt
2.数据集介绍
WiderPerson数据集是一个针对拥挤场景行人检测的基准数据集,其图像来源不再仅限于交通场景,而是从多种场景中精心挑选而来。该数据集包含13382张图像,并附带了约40万个遮挡标记作为注释。为了确保公平性和有效性,我们随机选取了8000张、1000张和4382张图像分别作为训练集、验证集和测试集。与CityPersons和WIDER FACE数据集类似,我们不会发布测试图像的标注文件,以防止潜在的作弊行为。
您可以通过以下网址下载WiderPerson数据集:WiderPerson: A Diverse Dataset for Dense Pedestrian Detection in the Wild。
下载完成之后,其文件夹如下

其中Annotations文件下的txt文件如下,第一行数字为该图像的标注数目(没啥用处),之后每行开头的数字为类别标签,本数据集内共有五个类别
1 : pedestrians
2 : riders
3 : partially-visible persons
4 : ignore regions
5 : crowd

为把它转成VOC格式文件,需要把这个txt文件转换成xml文件,代码如下
import os
import numpy as np
import scipy.io as sio
import shutil
from lxml.etree import Element, SubElement, tostring
from xml.dom.minidom import parseString
import cv2
def make_voc_dir():
    """Create the VOC2007 directory skeleton under the parent directory.

    Builds ../VOC2007/{Annotations, ImageSets/Main, JPEGImages}.
    Uses ``exist_ok=True`` so a partially created tree is completed:
    the original version skipped creating ``ImageSets/Main`` whenever
    ``ImageSets`` itself already existed.
    """
    os.makedirs('../VOC2007/Annotations', exist_ok=True)
    # makedirs creates intermediate dirs, so ImageSets is implied by Main.
    os.makedirs('../VOC2007/ImageSets/Main', exist_ok=True)
    os.makedirs('../VOC2007/JPEGImages', exist_ok=True)
if __name__ == '__main__':
    # WiderPerson annotation class ids (1-based) -> VOC class names.
    # NOTE: these short names ('partially', 'ignore') are what gets written
    # into the xml <name> fields; downstream class lists must match them.
    classes = {'1': 'pedestrians',
               '2': 'riders',
               '3': 'partially',
               '4': 'ignore',
               '5': 'crowd'}
    VOCRoot = '../VOC2007'
    wider_path = './WiderPerson/val.txt'  # list of image ids to convert
    make_voc_dir()

    with open(wider_path, 'r') as f:
        imgIds = f.read().splitlines()

    for imgId in imgIds:
        filename = imgId + '.jpg'
        img_path = './WiderPerson/Images/' + filename
        print('Img :%s' % img_path)
        img = cv2.imread(img_path)
        if img is None:
            # Unreadable/missing image: skip instead of crashing on img.shape.
            print('Skip unreadable image: %s' % img_path)
            continue
        height, width = img.shape[:2]  # image size in pixels

        # Build the VOC <annotation> tree for this image.
        node_root = Element('annotation')
        node_folder = SubElement(node_root, 'folder')
        node_folder.text = 'JPEGImages'
        node_filename = SubElement(node_root, 'filename')
        node_filename.text = 'VOC2007/JPEGImages/%s' % filename
        node_size = SubElement(node_root, 'size')
        node_width = SubElement(node_size, 'width')
        node_width.text = '%s' % width
        node_height = SubElement(node_size, 'height')
        node_height.text = '%s' % height
        node_depth = SubElement(node_size, 'depth')
        node_depth.text = '3'

        # Matching annotation file: first line is the object count
        # (redundant), each following line is
        # "<class_id> <xmin> <ymin> <xmax> <ymax>".
        label_path = img_path.replace('Images', 'Annotations') + '.txt'
        with open(label_path) as file:
            label_lines = file.read().splitlines()
        for line in label_lines[1:]:
            if not line.strip():
                continue  # tolerate a trailing blank line
            parts = line.split(' ')
            cls_id = parts[0]
            # VOC uses 1-based pixel coordinates, hence the +1 shift.
            xmin = int(parts[1]) + 1
            ymin = int(parts[2]) + 1
            xmax = int(parts[3]) + 1
            ymax = int(parts[4]) + 1
            cls_name = classes[cls_id]
            # Tiny boxes are flagged 'difficult' so evaluation can skip them.
            difficult = 1 if (ymax - ymin) <= 6 or (xmax - xmin) <= 6 else 0

            node_object = SubElement(node_root, 'object')
            node_name = SubElement(node_object, 'name')
            node_name.text = cls_name
            node_difficult = SubElement(node_object, 'difficult')
            node_difficult.text = '%s' % difficult
            node_bndbox = SubElement(node_object, 'bndbox')
            node_xmin = SubElement(node_bndbox, 'xmin')
            node_xmin.text = '%s' % xmin
            node_ymin = SubElement(node_bndbox, 'ymin')
            node_ymin.text = '%s' % ymin
            node_xmax = SubElement(node_bndbox, 'xmax')
            node_xmax.text = '%s' % xmax
            node_ymax = SubElement(node_bndbox, 'ymax')
            node_ymax.text = '%s' % ymax
            node_pose = SubElement(node_object, 'pose')
            node_pose.text = 'Unspecified'
            node_truncated = SubElement(node_object, 'truncated')
            node_truncated.text = '0'

        # Serialize the annotation and copy the image into the VOC tree.
        xml = tostring(node_root, pretty_print=True)
        xml_name = filename.replace('.jpg', '.xml')
        xml_path = VOCRoot + '/Annotations/' + xml_name
        with open(xml_path, 'wb') as f:
            f.write(xml)
        shutil.copy(img_path, VOCRoot + '/JPEGImages/' + filename)
可以用以下代码展示一下数据集
# -*- coding: utf-8 -*-
import os
import cv2

if __name__ == '__main__':
    # Visualize WiderPerson boxes: draw pedestrians (label 1) and
    # partially-visible persons (label 3) on each training image.
    path = './WiderPerson/train.txt'
    with open(path, 'r') as f:
        img_ids = f.read().splitlines()
    for img_id in img_ids:  # e.g. '000040'
        img_path = './WiderPerson/JPEGImages/' + img_id + '.jpg'
        print(img_path)
        img = cv2.imread(img_path)
        if img is None:
            # Unreadable/missing image: skip instead of crashing on img.shape.
            print('Skip unreadable image: %s' % img_path)
            continue
        label_path = img_path.replace('JPEGImages', 'Annotations') + '.txt'
        print(label_path)
        with open(label_path) as file:
            label_lines = file.read().splitlines()
        # First line: object count (unused). Each following line:
        # "<class_label> <xmin> <ymin> <xmax> <ymax>" where
        #   1: pedestrians, 2: riders, 3: partially-visible persons,
        #   4: ignore regions (fake persons, e.g. posters), 5: crowd.
        for line in label_lines[1:]:
            if not line.strip():
                continue
            parts = line.split(' ')
            cls = int(parts[0])
            print(cls)
            if cls == 1 or cls == 3:
                xmin = float(parts[1])
                ymin = float(parts[2])
                xmax = float(parts[3])
                ymax = float(parts[4])
                img = cv2.rectangle(img, (int(xmin), int(ymin)),
                                    (int(xmax), int(ymax)), (0, 255, 0), 2)
        cv2.imshow('result', img)
        cv2.waitKey(0)
3.数据集处理
用上述代码可以生成以下文件夹

下面划分数据集和验证集,用split_train_val.py
# coding:utf-8
"""Split VOC xml annotations into trainval/train/val/test id lists."""
import os
import random
import argparse

parser = argparse.ArgumentParser()
# Directory containing the xml annotations (usually .../Annotations).
parser.add_argument('--xml_path', default='./VOC2007/Annotations', type=str, help='input xml label path')
# Output directory for the split lists (usually .../ImageSets/Main).
parser.add_argument('--txt_path', default='./VOC2007/ImageSets/Main', type=str, help='output txt label path')
opt = parser.parse_args()

# Fraction of all images used for trainval (the rest becomes test), and the
# fraction of trainval that becomes train (the rest becomes val).
trainval_percent = 1
train_percent = 0.9

xmlfilepath = opt.xml_path
txtsavepath = opt.txt_path
print(xmlfilepath)
total_xml = os.listdir(xmlfilepath)
os.makedirs(txtsavepath, exist_ok=True)

num = len(total_xml)
tv = int(num * trainval_percent)
tr = int(tv * train_percent)
# Sets give O(1) membership tests in the loop below; the original used
# lists, making the split O(n^2). sorted() keeps random.sample happy
# (sampling from a set is removed in Python 3.11).
trainval = set(random.sample(range(num), tv))
train = set(random.sample(sorted(trainval), tr))

# Context managers guarantee the four list files are flushed and closed.
with open(txtsavepath + '/trainval.txt', 'w') as file_trainval, \
        open(txtsavepath + '/test.txt', 'w') as file_test, \
        open(txtsavepath + '/train.txt', 'w') as file_train, \
        open(txtsavepath + '/val.txt', 'w') as file_val:
    for i in range(num):
        name = total_xml[i][:-4] + '\n'  # strip the '.xml' suffix
        if i in trainval:
            file_trainval.write(name)
            if i in train:
                file_train.write(name)
            else:
                file_val.write(name)
        else:
            file_test.write(name)
生成的txt文件如下

再进一步,使用voc_labels.py,将其中的classes列表修改成自己的类别
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
import os
from os import getcwd

sets = ['train', 'val', 'test']
# Change to your own classes. These names must match the <name> values the
# conversion script wrote into the xml files ('partially' / 'ignore', not
# the long forms) — otherwise those objects are silently skipped by the
# `cls not in classes` check below.
classes = ["pedestrians", "riders", "partially", "ignore", "crowd"]
abs_path = os.getcwd()
print(abs_path)
def convert(size, box):
    """Convert a VOC box to normalized YOLO coordinates.

    Args:
        size: (image_width, image_height) in pixels.
        box: (xmin, xmax, ymin, ymax) in VOC pixel coordinates.

    Returns:
        (x_center, y_center, width, height), each normalized to [0, 1].
        The -1 shift on the center follows the darknet voc_label
        convention (VOC pixel coordinates are 1-based).
    """
    scale_x = 1. / (size[0])
    scale_y = 1. / (size[1])
    xmin, xmax, ymin, ymax = box
    center_x = ((xmin + xmax) / 2.0 - 1) * scale_x
    center_y = ((ymin + ymax) / 2.0 - 1) * scale_y
    box_w = (xmax - xmin) * scale_x
    box_h = (ymax - ymin) * scale_y
    return center_x, center_y, box_w, box_h
def convert_annotation(image_id):
    """Convert one VOC xml annotation into a YOLO txt label file.

    Reads D:/V5/VOC2007/Annotations/<image_id>.xml and writes
    D:/V5/VOC2007/labels/<image_id>.txt, one
    "<class_id> <cx> <cy> <w> <h>" line per kept object (normalized
    via `convert`). Objects whose class is not in the module-level
    `classes` list, or which are marked difficult, are skipped.
    """
    in_path = 'D:/V5/VOC2007/Annotations/%s.xml' % image_id
    out_path = 'D:/V5/VOC2007/labels/%s.txt' % image_id
    # Context managers close both files even on error; the original
    # leaked both handles.
    with open(in_path, encoding='UTF-8') as in_file, \
            open(out_path, 'w') as out_file:
        tree = ET.parse(in_file)
        root = tree.getroot()
        size = root.find('size')
        w = int(size.find('width').text)
        h = int(size.find('height').text)
        for obj in root.iter('object'):
            difficult = obj.find('difficult').text
            cls = obj.find('name').text
            if cls not in classes or int(difficult) == 1:
                continue
            cls_id = classes.index(cls)
            xmlbox = obj.find('bndbox')
            b1 = float(xmlbox.find('xmin').text)
            b2 = float(xmlbox.find('xmax').text)
            b3 = float(xmlbox.find('ymin').text)
            b4 = float(xmlbox.find('ymax').text)
            # Clamp boxes that run past the image border.
            b2 = min(b2, w)
            b4 = min(b4, h)
            bb = convert((w, h), (b1, b2, b3, b4))
            out_file.write(str(cls_id) + " " + " ".join(str(a) for a in bb) + '\n')
wd = getcwd()
# Create the YOLO labels directory once, up front (the original
# re-checked it on every iteration of the loop).
os.makedirs('D:/V5/VOC2007/labels/', exist_ok=True)
for image_set in sets:
    ids_path = 'D:/V5/VOC2007/ImageSets/Main/%s.txt' % image_set
    # Context managers close both files; the original never closed
    # the ids file handle.
    with open(ids_path) as ids_file:
        image_ids = ids_file.read().strip().split()
    # One absolute image path per line, plus a YOLO label file per image.
    with open('D:/V5/VOC2007/%s.txt' % image_set, 'w') as list_file:
        for image_id in image_ids:
            list_file.write('D:/V5/VOC2007/JPEGImages/%s.jpg\n' % image_id)
            convert_annotation(image_id)
四.训练过程
找到data文件夹的xView.yaml文件,复制一份,改成data.yaml文件,里面放自己的类别
修改前

修改后

找到yolov5s.yaml,复制一份改成yolov5s_s.yaml修改其中的nc参数

修改train.py中的参数,weights改成下载的预训练权重,cfg放yolov5s_s.yaml,data放 data.yaml,修改合理的epoch和batch_size,看着自己的显卡来

运行train.py报错,找网上教程自己修改
五.结果显示
训练完成之后,运行detect.py文件,修改参数,weights的权重在runs下面,source更改可以实现图片、摄像头、视频的检测,别的参数看着修改,一般不修改。

运行结果如下

欢迎交流评论,有啥问题评论区交流