4.如何处理Labelme标注后的数据

软件开发
2025-09-02 05:24:02

正文：

在上一篇文章中我们讲了如何标注数据集，那么我们在本篇文章中来讲一下如何处理这些数据集。下面我会提供相应的代码，大家只需要执行相应的代码即可。

# 一、将Labelme标注成功后的json格式数据转成yolo可直接使用的txt数据
# (Part 1: convert Labelme JSON annotations into txt label files that YOLO can use directly.)
# -*- coding: utf-8 -*-
"""
@Auth:ShiGuang
@Date:2023-10-30-17:57
"""
import json
import os


def json2yolo(path, filename, classdic):
    """Convert one Labelme JSON annotation file into a YOLO txt label file.

    The JSON file is read from ``path``/``filename``. Every entry of its
    ``"shapes"`` list becomes one output line of the form
    ``<class> <cx> <cy> <w> <h>``, where the four numbers are the box centre
    and size normalised by ``imageWidth`` / ``imageHeight`` and printed with
    four decimals.

    The txt file is written to the directory obtained by replacing
    ``"labels_json"`` with ``"labels"`` in ``path`` (created if missing), and
    keeps the JSON file's base name with a ``.txt`` extension.

    Args:
        path: Directory that contains the JSON file.
        filename: Name of the JSON file inside ``path``.
        classdic: Mapping from a Labelme label name to the class id written
            in the txt file.

    Raises:
        KeyError: If a shape's label is not a key of ``classdic`` or the JSON
            lacks one of the fields read here.
    """
    file_path = os.path.join(path, filename)
    # Use a context manager so the handle is closed; utf-8 keeps non-ASCII
    # content in the annotation readable.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    img_w = data["imageWidth"]
    img_h = data["imageHeight"]

    lines = []
    for shape in data["shapes"]:
        # Normalise every point, then take the bounding box. For a two-point
        # rectangle this is identical to using the two corners directly, and
        # it also copes with shapes that carry more than two points.
        xs = [point[0] / img_w for point in shape["points"]]
        ys = [point[1] / img_h for point in shape["points"]]
        x_min, x_max = min(xs), max(xs)
        y_min, y_max = min(ys), max(ys)
        cx = (x_min + x_max) / 2
        cy = (y_min + y_max) / 2
        lines.append("%s %.4f %.4f %.4f %.4f\n" % (
            classdic[shape["label"]], cx, cy, x_max - x_min, y_max - y_min))

    save_path = path.replace("labels_json", "labels")
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    # Swap only the file extension. A blanket str.replace('json', 'txt') on
    # the full path would also rewrite any "json" inside directory names.
    txt_name = os.path.splitext(filename)[0] + ".txt"
    with open(os.path.join(save_path, txt_name), 'w', encoding='utf-8') as f:
        f.write("".join(lines))


if __name__ == "__main__":
    path = r"./labels_json/"  # set this to the labels_json directory
    # Collect every JSON annotation file in that directory.
    path_list2 = [x for x in os.listdir(path) if x.endswith(".json")]
    classdic = {"notch": "0"}  # label name -> class id
    for filename in path_list2:
        json2yolo(path, filename, classdic)

接下来直接使用上面的脚本来处理就可以了，path这里换成标注完成的labels_json文件夹所在的路径，classdic这个地方主要是标签，在上一篇我们标注数据的时候只有一个标签notch，所以这里填写{"notch": "0"}，如果标注的时候有多个标签，那么这里就是{"标签1":0,"标签2":1}这样就可以了。填写完成之后就可以直接运行了，之后在labels_json文件夹同级路径下就能看到一个labels文件夹了，这个里面就是处理好的数据。

# 二、数据集划分
# (Part 2: split the dataset.)
# -*- coding:utf-8 -*-
"""
@Author:ShiGuang (时光)
@date:2023-11-18 9:29
"""
import math
import os
import random
import shutil

try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """Fallback when tqdm is not installed: iterate without a progress bar."""
        return iterable


def _is_inside(path, parent):
    """Return True when ``path`` is ``parent`` itself or located below it."""
    path = os.path.abspath(path)
    parent = os.path.abspath(parent)
    return path == parent or path.startswith(parent + os.sep)


def split_images_folder(img_dir, label_dir, save_img_path, img_suffix='.jpg',
                        train_pct=0.8, valid_pct=0.1, test_pct=0, seed=None):
    """Split an image/label dataset into train, val and optional test sets.

    Images are shuffled and copied, together with their same-named ``.txt``
    label when one exists, into
    ``<output>/<train|val|test>/<images|labels>``. ``<output>`` is always a
    folder named ``detect`` located in the parent directory of
    ``save_img_path``. An existing ``detect`` folder is deleted first.

    Any images left over by integer truncation of the split sizes go to the
    last split (``test`` when ``test_pct > 0``, otherwise ``val``), so no
    image is dropped.

    Note: the default percentages (0.8 / 0.1 / 0) sum to 0.9, so callers must
    pass percentages that sum to 1.0.

    Args:
        img_dir: Directory with the source images.
        label_dir: Directory with the source ``.txt`` label files.
        save_img_path: Path whose parent directory receives the ``detect``
            output folder.
        img_suffix: File suffix (or tuple of suffixes) selecting the images.
        train_pct: Fraction of images for the training set.
        valid_pct: Fraction of images for the validation set.
        test_pct: Fraction of images for the test set; no ``test`` folder is
            created when it is not greater than 0.
        seed: Optional seed for a reproducible shuffle. ``None`` uses the
            global ``random`` module state.

    Raises:
        ValueError: If the percentages do not sum to 1.0, or if ``img_dir`` /
            ``label_dir`` lies inside the output folder that would be deleted.
    """
    # Compare with a tolerance: 0.7 + 0.2 + 0.1 is 0.9999999999999999 in
    # floating point and must still be accepted.
    total_pct = train_pct + valid_pct + test_pct
    if not math.isclose(total_pct, 1.0, rel_tol=0.0, abs_tol=1e-9):
        raise ValueError("训练、验证和测试集的比例之和应为1.0")

    # Root of the output directory tree.
    split_dir = os.path.join(os.path.dirname(save_img_path), "detect")

    # Refuse to wipe the output folder when it contains the source data.
    if _is_inside(img_dir, split_dir) or _is_inside(label_dir, split_dir):
        raise ValueError(
            "img_dir/label_dir 不能位于将被删除的输出目录 {} 内".format(split_dir))

    # Remove the output of a previous run so stale files do not mix in.
    if os.path.exists(split_dir):
        print(f"正在删除已有的目录：{split_dir}")
        shutil.rmtree(split_dir)
        print(f"目录 {split_dir} 已删除。")

    # Sort first so that a given seed always produces the same order,
    # independent of the order os.listdir happens to return.
    imgs = sorted(f for f in os.listdir(img_dir) if f.endswith(img_suffix))
    if seed is None:
        random.shuffle(imgs)
    else:
        random.Random(seed).shuffle(imgs)

    # Split boundaries; the last split absorbs the truncation remainder.
    img_count = len(imgs)
    train_end = int(img_count * train_pct)
    if test_pct > 0:
        valid_end = train_end + int(img_count * valid_pct)
        dataset_splits = {
            'train': (0, train_end),
            'val': (train_end, valid_end),
            'test': (valid_end, img_count),
        }
    else:
        valid_end = img_count
        dataset_splits = {
            'train': (0, train_end),
            'val': (train_end, valid_end),
        }

    for split_name, (start_idx, end_idx) in dataset_splits.items():
        out_img_dir = os.path.join(split_dir, split_name, 'images')
        out_label_dir = os.path.join(split_dir, split_name, 'labels')
        os.makedirs(out_img_dir, exist_ok=True)
        os.makedirs(out_label_dir, exist_ok=True)

        for img_name in tqdm(imgs[start_idx:end_idx]):
            label_name = os.path.splitext(img_name)[0] + '.txt'
            src_label_path = os.path.join(label_dir, label_name)

            shutil.copy(os.path.join(img_dir, img_name),
                        os.path.join(out_img_dir, img_name))
            # Copy the label only when it exists; images without a label
            # are still copied.
            if os.path.exists(src_label_path):
                shutil.copy(src_label_path,
                            os.path.join(out_label_dir, label_name))

    if test_pct > 0:
        print('完成数据集划分：train:{}, val:{}, test:{}'.format(
            train_end, valid_end - train_end, img_count - valid_end))
    else:
        print('{} 图片划分为训练集和验证集'.format(img_count))


if __name__ == "__main__":
    # Run the split with the paths used in this tutorial.
    img_dir = r"./images/"
    label_dir = r"./labels/"
    save_img_path = r"./detect"
    train_pct = 0.9
    valid_pct = 0.1
    test_pct = 0  # test set fraction; 0 means no test set is created
    split_images_folder(img_dir, label_dir, save_img_path, train_pct=train_pct,
                        valid_pct=valid_pct, test_pct=test_pct)

上面的代码很简单，只需要填写对应的路径就可以了：img_dir 是图片所在的文件夹路径，label_dir 是刚才转换完成后的 labels 文件夹所在的路径，save_img_path 是最终的输出路径。执行完成之后，会在 save_img_path 的上一级目录下生成一个 detect 文件夹；打开这个文件夹，可以看到 train 和 val 两个文件夹，分别是训练集和验证集，两个文件夹下各有 images 和 labels 两个子文件夹。到这里整个数据集就都处理好了。下面我提供一份简单的数据集吧，90张（已经标注好的）。

shopee数据集

好了，本篇文章到此结束，下一篇文章我们讲如何使用yolo来训练我们的验证码识别模型

标签：

4.如何处理Labelme标注后的数据由讯客互联软件开发栏目发布，感谢您对讯客互联的认可，以及对我们原创作品以及文章的青睐，非常欢迎各位朋友分享到个人网站或者朋友圈，但转载请说明文章出处“4.如何处理Labelme标注后的数据”

上一篇
深圳SMT贴片加工厂家核心技术及服务优势解析

下一篇
2025-2-18-4.7二叉树（基础题）