Caffe需要将数据存储为lmdb、hdf5等数据库格式。Caffe自带的convert_imageset工具可以完成lmdb数据格式的转换,但是遇到浮点数数据,这个工具就不行了,所以只能存成hdf5。针对用python存储hdf5,本文分析多个demo的数据生成代码,并作出总结(主要就是贴代码了):
首先来看几个生成hdf5的demo(代码并不完整,这里只给出和存储数据有关的部分,完整代码参见本人之前的博客)。这是之前安德森鸢尾花(Iris)数据集的demo。在这个demo中,要存储的数据data是(150,1,1,4)的四维结构,label是(150,3)的二维结构;第一维是样本,data和label需要一一对应。我们来看存储数据的python代码:
'''
Requirements:
  sudo pip install pydot
  sudo apt-get install -y graphviz

Interesting resources on Caffe:
  - https://github.com/BVLC/caffe/tree/master/examples
  - http://nbviewer.ipython.org/github/joyofdata/joyofdata-articles/blob/master/deeplearning-with-caffe/Neural-Networks-with-Caffe-on-the-GPU.ipynb

Interesting resources on Iris with ANNs:
  - iris data set test bed: http://deeplearning4j.org/iris-flower-dataset-tutorial.html
  - http://se.mathworks.com/help/nnet/examples/iris-clustering.html
  - http://lab.fs.uni-lj.si/lasin/wp/IMIT_files/neural/doc/seminar8.pdf

Synonyms:
  - output = label = target
  - input = feature
'''

import subprocess
import platform
import copy
from sklearn.datasets import load_iris
import sklearn.metrics
import numpy as np
# NOTE(review): `sklearn.cross_validation` only exists in old scikit-learn
# releases; newer ones expose StratifiedShuffleSplit from
# `sklearn.model_selection` -- confirm against the installed version.
from sklearn.cross_validation import StratifiedShuffleSplit
import matplotlib.pyplot as plt
import h5py
import caffe
import caffe.draw


def load_data():
    '''
    Load the Iris data set and arrange it for storage in HDF5.

    Returns a dict with two entries:
      - 'input':  the features reshaped to (n_samples, 1, 1, n_features)
      - 'output': the one-hot encoded targets, shape (n_samples, 3)

    The raw features, raw targets and one-hot targets are printed along
    the way.
    '''
    data = load_iris()
    print(data.data)
    print(data.target)

    n_samples = len(data.target)

    # One-hot encode the integer class ids: row i gets a 1 in column target[i].
    targets = np.zeros((n_samples, 3))
    targets[np.arange(n_samples), data.target] = 1
    print(targets)

    new_data = {}
    # Derive the sample and feature counts from the data instead of
    # hard-coding (150, 1, 1, 4).
    new_data['input'] = np.reshape(data.data, (n_samples, 1, 1, -1))
    new_data['output'] = targets
    return new_data


def save_data_as_hdf5(hdf5_data_filename, data):
    '''
    HDF5 is one of the data formats Caffe accepts.

    Writes data['input'] to the 'data' dataset and data['output'] to the
    'label' dataset of `hdf5_data_filename`, both cast to float32. The file
    is opened in 'w' mode.
    '''
    with h5py.File(hdf5_data_filename, 'w') as f:
        f['data'] = data['input'].astype(np.float32)
        f['label'] = data['output'].astype(np.float32)


def main():
    '''
    This is the main function.

    Writes the Iris data to the train and test HDF5 files, trains the
    network, then prints predictions, the network and its accuracy.

    NOTE: train, get_predicted_output, get_predicted_outputs, print_network,
    print_network_weights and get_accuracy are not part of this excerpt;
    they come from the complete script.
    '''
    # Set parameters
    solver_prototxt_filename = 'iris_solver.prototxt'
    train_test_prototxt_filename = 'iris_train_test.prototxt'
    deploy_prototxt_filename = 'iris_deploy.prototxt'
    deploy_prototxt_batch2_filename = 'iris_deploy_batchsize2.prototxt'
    hdf5_train_data_filename = 'iris_train_data.hdf5'
    hdf5_test_data_filename = 'iris_test_data.hdf5'
    caffemodel_filename = 'iris__iter_1000000.caffemodel'  # generated by train()

    # Prepare data (the same samples are written to both the train and test files)
    data = load_data()
    print(data)
    save_data_as_hdf5(hdf5_train_data_filename, data)
    save_data_as_hdf5(hdf5_test_data_filename, data)

    # Train network
    train(solver_prototxt_filename)

    # Get predicted outputs
    single_input = np.array([[5.1, 3.5, 1.4, 0.2]])
    print(get_predicted_output(deploy_prototxt_filename, caffemodel_filename, single_input))
    # Batch-size-2 variant, kept for reference:
    # batch_input = np.array([[[[5.1, 3.5, 1.4, 0.2]]], [[[5.9, 3., 5.1, 1.8]]]])
    # print(get_predicted_output(deploy_prototxt_batch2_filename, caffemodel_filename, batch_input))

    # Print network
    print_network(deploy_prototxt_filename, caffemodel_filename)
    print_network(train_test_prototxt_filename, caffemodel_filename)
    print_network_weights(train_test_prototxt_filename, caffemodel_filename)

    # Compute performance metrics
    inputs = data['input']
    outputs = get_predicted_outputs(deploy_prototxt_filename, caffemodel_filename, inputs)
    get_accuracy(data['output'], outputs)


if __name__ == "__main__":
    main()
    #cProfile.run('main()') # if you want to do some profiling
上面的例子中,data是浮点型,label是类别数组。接下来,我们看一个和人脸特征点检测有关的多标签数据hdf5生成工具。下面这段代码同样不完整,我们就来看一看:
import numpy as np
import pandas as pd
from numpy import genfromtxt
from numpy import ravel
import pylab as pl
from skimage import transform
import h5py
# NOTE(review): `sklearn.cross_validation` only exists in old scikit-learn
# releases; newer ones provide train_test_split in `sklearn.model_selection`.
from sklearn import cross_validation
import uuid
import random
from skimage import io, exposure, img_as_uint, img_as_float
from numpy import (array, dot, arccos)
from numpy.linalg import norm


def _write_hdf5(filename, data, label):
    """Write `data` and `label` as gzip-compressed datasets of one HDF5 file.

    The file is opened in 'w' mode. The context manager closes it even if
    writing one of the datasets raises.
    """
    with h5py.File(filename, "w") as f:
        f.create_dataset("data", data=data, compression="gzip", compression_opts=4)
        f.create_dataset("label", data=label, compression="gzip", compression_opts=4)


# NOTE: X (samples) and y (labels) are built earlier in the complete script;
# that part is not included in this excerpt.

# Fixed split: the first 1600 samples are used for training, the rest for testing.
#X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,labels, test_size=0.30)
X_train = X[:1600]
y_train = y[:1600]
X_test = X[1600:]
y_test = y[1600:]

# A single pre-formatted string prints the same text on Python 2 and Python 3.
print('Train, Test shapes (X,y): {} {} {} {}'.format(
    X_train.shape, y_train.shape, X_test.shape, y_test.shape))

# Train data
_write_hdf5("facialkp-train.hd5", X_train, y_train)

# Test data
_write_hdf5("facialkp-test.hd5", X_test, y_test)
到这里,我们大概就明白了:python中hdf5文件的写入非常简单,只需把标签写入名为label的数据集、把数据写入名为data的数据集,就可以被caffe接收了。注意label是二维而data是四维,就像这样:
def save_data_as_hdf5(hdf5_data_filename, data):
    '''
    Store samples and labels in an HDF5 file, one of the formats Caffe accepts.

    data['input'] is written to the 'data' dataset and data['output'] to the
    'label' dataset, each cast to float32. The file is opened in 'w' mode.
    '''
    # (HDF5 dataset name, key in `data`), in the order they are written.
    layout = (('data', 'input'), ('label', 'output'))
    with h5py.File(hdf5_data_filename, 'w') as out:
        for dataset_name, key in layout:
            out[dataset_name] = data[key].astype(np.float32)
博主由于使用需求比较特别,在windows平台采集数据,存储为npy文件,然后在linux中加载npy文件,将其转储为hdf5文件,下面给出博主的源码:
#!/usr/bin/env python
import numpy as np
import h5py


def save_data_as_hdf5(hdf5_data_filename, np_filename, dtype=None):
    '''
    Copy the array stored in a .npy file into the 'label' dataset of an
    HDF5 file (HDF5 is one of the data formats Caffe accepts).

    hdf5_data_filename: path of the HDF5 file to write (opened in 'w' mode).
    np_filename:        path of the .npy file to read with np.load.
    dtype:              optional dtype to cast the array to before writing,
                        e.g. np.float32; None (the default) stores the array
                        with the dtype it has in the .npy file.
    '''
    # Load first, so a missing or unreadable .npy file does not leave an
    # empty, freshly truncated HDF5 file behind.
    labels = np.load(np_filename)
    if dtype is not None:
        labels = labels.astype(dtype)

    with h5py.File(hdf5_data_filename, 'w') as f:
        f['label'] = labels


def main():
    '''Convert the train and test label .npy files to HDF5.'''
    save_data_as_hdf5("train_label.hdf5", "/usr/database/train_labels.npy")
    save_data_as_hdf5("test_label.hdf5", "/usr/database/test_labels.npy")


if __name__ == '__main__':
    main()
是不是感觉非常简单呀~
OK, See You Next Chapter!