Caffe 需要将数据存储为 LMDB、HDF5 等数据库格式。Caffe 自带的 convert_imageset 工具可以完成 LMDB 数据格式的转换,但是遇到浮点数,这个工具就不行了,所以只能存成 HDF5。针对用 Python 存储 HDF5,本文分析多个 demo 的数据生成代码,并作出总结(主要就是贴代码了):
首先来看几个生成 HDF5 的 demo(代码并不完整,这里只给出和存储数据有关的部分,完整代码参见本人之前的博客)。第一个是之前安德森鸢尾花卉(Iris)数据集的 demo。在这个 demo 中,要存储的数据 data 是 (150,1,1,4) 的四维结构,label 是 (150,3) 的二维结构。当然,第一维是样本数,data 与 label 需要一一对应。我们来看存储数据的 Python 代码:
'''
Requirements:
sudo pip install pydot
sudo apt-get install -y graphviz
Interesting resources on Caffe:
- https://github.com/BVLC/caffe/tree/master/examples
- http://nbviewer.ipython.org/github/joyofdata/joyofdata-articles/blob/master/deeplearning-with-caffe/Neural-Networks-with-Caffe-on-the-GPU.ipynb
Interesting resources on Iris with ANNs:
- iris data set test bed: http://deeplearning4j.org/iris-flower-dataset-tutorial.html
- http://se.mathworks.com/help/nnet/examples/iris-clustering.html
- http://lab.fs.uni-lj.si/lasin/wp/IMIT_files/neural/doc/seminar8.pdf
Synonyms:
- output = label = target
- input = feature
'''
import subprocess
import platform
import copy
from sklearn.datasets import load_iris
import sklearn.metrics
import numpy as np
from sklearn.cross_validation import StratifiedShuffleSplit
import matplotlib.pyplot as plt
import h5py
import caffe
import caffe.draw
def load_data():
    '''
    Load the Iris data set and shape it for Caffe.

    Prints the raw features, the raw integer targets and the one-hot
    encoded targets along the way.

    Returns a dict with two entries:
    - 'input':  the features reshaped to (150, 1, 1, 4)
    - 'output': one row per sample with a 1 in the column of its class
                (3 columns), zeros elsewhere
    '''
    iris = load_iris()
    print(iris.data)
    print(iris.target)

    # One-hot encode: row i gets a 1 in column iris.target[i].
    sample_count = len(iris.target)
    one_hot = np.zeros((sample_count, 3))
    one_hot[np.arange(sample_count), iris.target] = 1
    print(one_hot)

    return {
        'input': np.reshape(iris.data, (150, 1, 1, 4)),
        'output': one_hot,
    }
def save_data_as_hdf5(hdf5_data_filename, data):
    '''
    Write a data dict to an HDF5 file (one of the data formats Caffe accepts).

    data['input'] is stored under the name 'data' and data['output'] under
    the name 'label', both cast to float32. The file is opened in 'w' mode.
    '''
    # (name inside the HDF5 file, key in the data dict), written in this order.
    dataset_sources = (('data', 'input'), ('label', 'output'))
    with h5py.File(hdf5_data_filename, 'w') as hdf5_file:
        for dataset_name, source_key in dataset_sources:
            hdf5_file[dataset_name] = data[source_key].astype(np.float32)
def main():
    '''
    Run the Iris demo end to end.

    Loads the data, writes it to the train and test HDF5 files, calls
    train(), prints a prediction for one sample, prints the network and its
    weights, and finally calls get_accuracy() on the predictions for the
    whole data set.
    '''
    # Set parameters
    solver_prototxt_filename = 'iris_solver.prototxt'
    train_test_prototxt_filename = 'iris_train_test.prototxt'
    deploy_prototxt_filename = 'iris_deploy.prototxt'
    hdf5_train_data_filename = 'iris_train_data.hdf5'
    hdf5_test_data_filename = 'iris_test_data.hdf5'
    caffemodel_filename = 'iris__iter_1000000.caffemodel'  # generated by train()

    # Prepare data. This demo uses the same full data set for both the
    # training file and the test file.
    data = load_data()
    print(data)
    train_data = data
    test_data = data
    save_data_as_hdf5(hdf5_train_data_filename, train_data)
    save_data_as_hdf5(hdf5_test_data_filename, test_data)

    # Train network
    train(solver_prototxt_filename)

    # Get predicted outputs for a single sample
    sample = np.array([[5.1, 3.5, 1.4, 0.2]])
    print(get_predicted_output(deploy_prototxt_filename, caffemodel_filename, sample))
    # A two-sample batch can be predicted the same way by pointing at
    # 'iris_deploy_batchsize2.prototxt' and passing an input such as
    # [[[[5.1, 3.5, 1.4, 0.2]]], [[[5.9, 3.0, 5.1, 1.8]]]].

    # Print network
    print_network(deploy_prototxt_filename, caffemodel_filename)
    print_network(train_test_prototxt_filename, caffemodel_filename)
    print_network_weights(train_test_prototxt_filename, caffemodel_filename)

    # Compute performance metrics
    inputs = data['input']
    outputs = get_predicted_outputs(deploy_prototxt_filename, caffemodel_filename, inputs)
    get_accuracy(data['output'], outputs)
# Script entry point: run the demo only when executed directly, not on import.
if __name__ == '__main__':
    main()
    # cProfile.run('main()')  # if you want to do some profiling
上面的例子中,data 是浮点型,label 是类别数组型。接下来,我们看一个和人脸特征点检测有关的多标签数据 HDF5 生成工具。下面这段代码同样不完整,我们就来看一看:
import numpy as np
import pandas as pd
from numpy import genfromtxt
from numpy import ravel
import pylab as pl
from skimage import transform
import h5py
from sklearn import cross_validation
import uuid
import random
from skimage import io, exposure, img_as_uint, img_as_float
from numpy import (array, dot, arccos)
from numpy.linalg import norm
# Split X / y (defined earlier in the full script) into train and test sets
# and write each set to its own gzip-compressed HDF5 file.

# Alternative random split:
# X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, labels, test_size=0.30)

# Fixed split: the first 1600 samples are for training, the rest for testing.
X_train = X[:1600]
y_train = y[:1600]
X_test = X[1600:]
y_test = y[1600:]
# A single formatted string prints the same text on Python 2 and Python 3.
print('Train, Test shapes (X,y): %s %s %s %s'
      % (X_train.shape, y_train.shape, X_test.shape, y_test.shape))

# Train data. The with-block closes the file even if a write fails.
with h5py.File("facialkp-train.hd5", "w") as f:
    f.create_dataset("data", data=X_train, compression="gzip", compression_opts=4)
    f.create_dataset("label", data=y_train, compression="gzip", compression_opts=4)

# Test data
with h5py.File("facialkp-test.hd5", "w") as f:
    f.create_dataset("data", data=X_test, compression="gzip", compression_opts=4)
    f.create_dataset("label", data=y_test, compression="gzip", compression_opts=4)
到这里,我们大概就明白了,python中hdf5文件的写入非常简单,我们只需为标签指定存储到label位置,数据存储到data位置,就可以被接收了,注意label是二维而data是四维,就像这样:
def save_data_as_hdf5(hdf5_data_filename, data):
    '''
    Store a data dict in an HDF5 file (one of the data formats Caffe accepts).

    The 'input' entry is written under the name 'data' and the 'output'
    entry under the name 'label', each converted to float32 first. The file
    is opened in 'w' mode.
    '''
    with h5py.File(hdf5_data_filename, 'w') as out:
        features = data['input'].astype(np.float32)
        out['data'] = features
        labels = data['output'].astype(np.float32)
        out['label'] = labels
博主由于使用需求比较特别,在windows平台采集数据,存储为npy文件,然后在linux中加载npy文件,将其转储为hdf5文件,下面给出博主的源码:
#!/usr/bin/env python
import numpy as np
import h5py
def save_data_as_hdf5(hdf5_data_filename, np_filename):
    '''
    Convert a .npy file into an HDF5 file (one of the data formats Caffe
    accepts).

    The array loaded from np_filename is written, as loaded, under the name
    'label' in hdf5_data_filename, which is opened in 'w' mode.

    For comparison, the dict-based variant of this helper stores two arrays:
        f['data'] = data['input'].astype(np.float32)
        f['label'] = data['output'].astype(np.float32)
    '''
    # Load first: if the .npy file is missing or unreadable, fail before
    # 'w' mode creates or truncates the output file.
    labels = np.load(np_filename)
    with h5py.File(hdf5_data_filename, 'w') as f:
        f['label'] = labels
def main():
    '''
    Convert the train and test label .npy files to HDF5, train file first.
    '''
    # (output HDF5 file, input .npy file)
    conversions = (
        ("train_label.hdf5", "/usr/database/train_labels.npy"),
        ("test_label.hdf5", "/usr/database/test_labels.npy"),
    )
    for hdf5_name, npy_path in conversions:
        save_data_as_hdf5(hdf5_name, npy_path)
# Script entry point: convert the label files only when run directly.
if __name__ == "__main__":
    main()
是不是感觉非常简单呀~
OK, See You Next Chapter!