Importing your own image dataset into h5py with Python as preprocessing for recognition
In many cases, when training a convolutional neural network, you need to use your own images as the network's input.
Storing your own image dataset in an h5py (HDF5) file takes up little space and is convenient to use.
Prerequisite: your own images, e.g. cats vs. dogs, with the two classes placed in two separate folders (in my case, yes_tumble and not_tumble).
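For reference, the folder layout the code below expects looks roughly like this (the layout is illustrative; only the two class sub-folders matter):

F:/CSISA_Picture
    not_tumble/     images of class 0 (e.g. cats)
    yes_tumble/     images of class 1 (e.g. dogs)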
# import the necessary packages
import os
import numpy as np
from PIL import Image
import tensorflow as tf
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
import h5py
import scipy
def get_files(file_dir):
    cats = []
    label_cats = []
    dogs = []
    label_dogs = []
    for file in os.listdir(file_dir + "/not_tumble"):
        cats.append(file_dir + "/not_tumble" + "/" + file)
        label_cats.append(0)  # label for this class is 0; this is a binary example, add more labels for multi-class problems
    for file in os.listdir(file_dir + "/yes_tumble"):
        dogs.append(file_dir + "/yes_tumble" + "/" + file)
        label_dogs.append(1)
    # combine cats and dogs into one list of images and one list of labels
    image_list = np.hstack((cats, dogs))
    label_list = np.hstack((label_cats, label_dogs))
    # use shuffle to randomize the order
    temp = np.array([image_list, label_list])
    temp = temp.transpose()
    np.random.shuffle(temp)
    # take the image and label lists back out of the shuffled temp
    image_list = list(temp[:, 0])
    label_list = list(temp[:, 1])
    label_list = [int(i) for i in label_list]
    return image_list, label_list  # returns two lists, image file paths and their labels, in shuffled order
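Note that np.hstack over the mixed array turns the integer labels into strings, which is why they have to be converted back with int(i). A minimal alternative sketch that shuffles paths and labels together without that round-trip (using Python's random.shuffle on zipped pairs; get_files_alt is my own name, not code from the original post):

import random

def get_files_alt(file_dir):
    pairs = []
    for name in os.listdir(file_dir + "/not_tumble"):
        pairs.append((file_dir + "/not_tumble/" + name, 0))  # class 0
    for name in os.listdir(file_dir + "/yes_tumble"):
        pairs.append((file_dir + "/yes_tumble/" + name, 1))  # class 1
    random.shuffle(pairs)  # shuffle paths and labels together, keeping them paired
    image_list = [p for p, _ in pairs]
    label_list = [l for _, l in pairs]
    return image_list, label_list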
train_dir = "F:/CSISA_Picture"
image_list, label_list = get_files(train_dir)
print(len(image_list))
print(len(label_list))
# 450 is 20% of the dataset length and is held out as the test set
Train_image = np.random.rand(len(image_list) - 450, 64, 64, 3).astype("float32")
Train_label = np.random.rand(len(image_list) - 450, 1).astype("float32")
Test_image = np.random.rand(450, 64, 64, 3).astype("float32")
Test_label = np.random.rand(450, 1).astype("float32")
# images are assumed to already be 64x64 RGB; resize them beforehand if they are not (see the sketch below)
for i in range(len(image_list) - 450):
    Train_image[i] = np.array(plt.imread(image_list[i]))
    Train_label[i] = np.array(label_list[i])
for i in range(len(image_list) - 450, len(image_list)):
    Test_image[i + 450 - len(image_list)] = np.array(plt.imread(image_list[i]))
    Test_label[i + 450 - len(image_list)] = np.array(label_list[i])
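If the source images are not already 64x64 RGB, the assignments above will fail with a shape mismatch. A minimal sketch of loading and resizing with PIL (already imported above; the helper name load_resized is my own, not from the original post):

def load_resized(path, size=(64, 64)):
    # open the image, force 3-channel RGB, and resize to the target shape
    img = Image.open(path).convert("RGB").resize(size)
    return np.array(img)

# usage inside the loops above, e.g.:
# Train_image[i] = load_resized(image_list[i])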
# Create a new file
f = h5py.File("data.h5", "w")
f.create_dataset("X_train", data=Train_image)
f.create_dataset("y_train", data=Train_label)
f.create_dataset("X_test", data=Test_image)
f.create_dataset("y_test", data=Test_label)
f.close()
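If file size matters, h5py can also apply the gzip filter when writing the datasets; this is a standard h5py option rather than something used in the original code, for example:

with h5py.File("data_compressed.h5", "w") as f:
    # gzip-compressed datasets; compression_opts sets the gzip level (0-9)
    f.create_dataset("X_train", data=Train_image, compression="gzip", compression_opts=4)
    f.create_dataset("y_train", data=Train_label, compression="gzip", compression_opts=4)
    f.create_dataset("X_test", data=Test_image, compression="gzip", compression_opts=4)
    f.create_dataset("y_test", data=Test_label, compression="gzip", compression_opts=4)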
# Load hdf5 dataset
train_dataset = h5py.File("data.h5", "r")
train_set_x_orig = np.array(train_dataset["X_train"][:])  # your train set features
train_set_y_orig = np.array(train_dataset["y_train"][:])  # your train set labels
test_set_x_orig = np.array(train_dataset["X_test"][:])    # your test set features
test_set_y_orig = np.array(train_dataset["y_test"][:])    # your test set labels
train_dataset.close()
print(train_set_x_orig.shape)
print(train_set_y_orig.shape)
print(train_set_x_orig.max())
print(train_set_x_orig.min())
print(test_set_x_orig.shape)
print(test_set_y_orig.shape)
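The max/min values printed above show the raw pixel range. Before feeding the arrays to a network they are usually scaled to [0, 1]; a small loader sketch that wraps the reads and does this scaling (the function name load_dataset and the division by 255 are my assumptions, assuming the images were read as 0-255 values, not code from the original post):

def load_dataset(path="data.h5"):
    with h5py.File(path, "r") as f:
        X_train = np.array(f["X_train"][:]) / 255.0  # scale pixels to [0, 1]
        y_train = np.array(f["y_train"][:])
        X_test = np.array(f["X_test"][:]) / 255.0
        y_test = np.array(f["y_test"][:])
    return X_train, y_train, X_test, y_test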
# sanity check: display one training image and print its label
plt.imshow(train_set_x_orig[222])
print(train_set_y_orig[222])
plt.show()