利用python生成供给Caffe的hdf5数据

caffe中需要将数据存储为lmdb、hdf5等数据库格式。caffe自带的convert_imageset工具可以完成图像数据到lmdb格式的转换，但是它无法处理浮点型的数据或标签，遇到这种情况就只能存成hdf5了。对于用python存储hdf5，本文分析多个demo的数据生成代码，并作出总结（主要就是贴代码了）：

首先来看几个生成hdf5的demo（代码并不完整，这里只给出和存储数据有关的部分，完整代码参见本人之前的博客）。这是之前鸢尾花（Iris）数据集的demo。在这个demo中，要存储的数据data是（150,1,1,4）的四维结构，label是（150,3）的二维结构（one-hot编码）。当然了，两者的第一维都是样本数，需要一一对应。我们来看存储数据的python代码：

'''

Requirements:
sudo pip install pydot
sudo apt-get install -y graphviz

Interesting resources on Caffe:
 - https://github.com/BVLC/caffe/tree/master/examples
 - http://nbviewer.ipython.org/github/joyofdata/joyofdata-articles/blob/master/deeplearning-with-caffe/Neural-Networks-with-Caffe-on-the-GPU.ipynb
 
Interesting resources on Iris with ANNs:
 - iris data set test bed: http://deeplearning4j.org/iris-flower-dataset-tutorial.html
 - http://se.mathworks.com/help/nnet/examples/iris-clustering.html
 - http://lab.fs.uni-lj.si/lasin/wp/IMIT_files/neural/doc/seminar8.pdf
 
Synonyms:
 - output = label = target
 - input = feature 
'''

import subprocess
import platform
import copy

from sklearn.datasets import load_iris
import sklearn.metrics 
import numpy as np
from sklearn.cross_validation import StratifiedShuffleSplit
import matplotlib.pyplot as plt
import h5py
import caffe
import caffe.draw


def load_data():
    '''
    Fetch the Iris data set and package it for writing to HDF5.

    The raw features, the raw targets and the one-hot target matrix are
    printed along the way.

    Returns a dict with two entries:
      - 'input':  the feature matrix reshaped to (150, 1, 1, 4)
      - 'output': a (number of samples, 3) one-hot encoding of the targets
    '''
    iris = load_iris()
    print(iris.data)
    print(iris.target)

    # One-hot encode: row i gets a 1 in the column given by iris.target[i].
    n_samples = len(iris.target)
    one_hot = np.zeros((n_samples, 3))
    one_hot[np.arange(n_samples), iris.target] = 1
    print(one_hot)

    return {
        'input': np.reshape(iris.data, (150, 1, 1, 4)),
        'output': one_hot,
    }

def save_data_as_hdf5(hdf5_data_filename, data):
    '''
    Write data['input'] and data['output'] to an HDF5 file.

    hdf5_data_filename: path of the file to create (opened in 'w' mode).
    data: mapping whose 'input' and 'output' values provide .astype();
          they are stored as float32 under the names 'data' and 'label'.
    '''
    with h5py.File(hdf5_data_filename, 'w') as hdf5_file:
        # (dataset name in the file, key in the incoming mapping)
        for dataset_name, key in (('data', 'input'), ('label', 'output')):
            hdf5_file.create_dataset(dataset_name,
                                     data=data[key].astype(np.float32))
    
def main():
    '''
    Run the Iris demo end to end: write HDF5 data, train, predict, report.

    The complete data set returned by load_data() is written to both the
    training and the test HDF5 file; no train/test split is made here.
    train, get_predicted_output, get_predicted_outputs, print_network,
    print_network_weights and get_accuracy are not part of this excerpt
    and must be provided by the rest of the script.
    '''

    # Set parameters
    solver_prototxt_filename = 'iris_solver.prototxt'
    train_test_prototxt_filename = 'iris_train_test.prototxt'
    deploy_prototxt_filename = 'iris_deploy.prototxt'
    hdf5_train_data_filename = 'iris_train_data.hdf5'
    hdf5_test_data_filename = 'iris_test_data.hdf5'
    caffemodel_filename = 'iris__iter_1000000.caffemodel'  # generated by train()

    # Prepare data
    data = load_data()
    print(data)
    save_data_as_hdf5(hdf5_train_data_filename, data)
    save_data_as_hdf5(hdf5_test_data_filename, data)

    # Train network
    train(solver_prototxt_filename)

    # Get the predicted output for one hand-written sample.
    # (Named `sample` rather than `input` to avoid shadowing the builtin.)
    sample = np.array([[5.1, 3.5, 1.4, 0.2]])
    print(get_predicted_output(deploy_prototxt_filename, caffemodel_filename, sample))

    # Print network
    print_network(deploy_prototxt_filename, caffemodel_filename)
    print_network(train_test_prototxt_filename, caffemodel_filename)
    print_network_weights(train_test_prototxt_filename, caffemodel_filename)

    # Compute performance metrics on the same data that was used for training
    inputs = data['input']
    outputs = get_predicted_outputs(deploy_prototxt_filename, caffemodel_filename, inputs)
    get_accuracy(data['output'], outputs)


if __name__ == "__main__":
    main()
    #cProfile.run('main()') # if you want to do some profiling

上面的例子中，data是浮点型，label是one-hot形式的类别数组。接下来，我们看一个和人脸特征点检测有关的多标签数据hdf5生成工具。下面这段代码同样不完整（X和y的构造部分被省略了），我们就来看一看：

import numpy as np
import pandas as pd
from numpy import genfromtxt
from numpy import ravel
import pylab as pl
from skimage import transform
import h5py
from sklearn import cross_validation
import uuid
import random
from skimage import io, exposure, img_as_uint, img_as_float
from numpy import (array, dot, arccos)
from numpy.linalg import norm

#X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,labels, test_size=0.30)

# Fixed split: the first 1600 samples are used for training, the rest for
# testing. X and y are built by code that is not shown in this excerpt.
X_train, y_train = X[:1600], y[:1600]
X_test, y_test = X[1600:], y[1600:]

# A single pre-formatted string prints identically under Python 2 and 3.
print('Train, Test shapes (X,y): %s %s %s %s'
      % (X_train.shape, y_train.shape, X_test.shape, y_test.shape))

# Write each split to its own gzip-compressed HDF5 file. The context manager
# closes the file even if create_dataset raises.
for hdf5_filename, features, labels in (
        ("facialkp-train.hd5", X_train, y_train),
        ("facialkp-test.hd5", X_test, y_test)):
    with h5py.File(hdf5_filename, "w") as f:
        f.create_dataset("data", data=features, compression="gzip", compression_opts=4)
        f.create_dataset("label", data=labels, compression="gzip", compression_opts=4)

到这里，我们大概就明白了：python中hdf5文件的写入非常简单，只需把标签存储到名为label的数据集、把数据存储到名为data的数据集，就可以被Caffe接收了。注意label是二维而data是四维，就像这样：

def save_data_as_hdf5(hdf5_data_filename, data):
    '''
    Store a features/labels pair in an HDF5 file.

    hdf5_data_filename: path of the file to create (opened in 'w' mode).
    data: mapping with 'input' and 'output' entries; each is cast to
          float32 and saved as the 'data' and 'label' dataset respectively.
    '''
    with h5py.File(hdf5_data_filename, 'w') as out:
        out.create_dataset('data', data=data['input'].astype(np.float32))
        out.create_dataset('label', data=data['output'].astype(np.float32))

博主由于使用需求比较特别，在windows平台采集数据，存储为npy文件，然后在linux中加载npy文件，将其转储为hdf5文件，下面给出博主的源码：

#!/usr/bin/env python
import numpy as np
import h5py

def save_data_as_hdf5(hdf5_data_filename, np_filename):
    '''
    Copy the array stored in a .npy file into an HDF5 file as 'label'.

    hdf5_data_filename: path of the HDF5 file to create (opened in 'w' mode).
    np_filename: path handed to np.load; the loaded array is stored as-is.

    NOTE(review): the array is written with whatever dtype np.load returns;
    no float32 cast is applied here -- confirm that is what Caffe should get.
    '''
    with h5py.File(hdf5_data_filename, 'w') as hdf5_file:
        labels = np.load(np_filename)
        hdf5_file.create_dataset('label', data=labels)

def main():
    '''Convert the train and test label .npy files to HDF5, in that order.'''
    # (HDF5 file to create, .npy file to read)
    conversions = (
        ("train_label.hdf5", "/usr/database/train_labels.npy"),
        ("test_label.hdf5", "/usr/database/test_labels.npy"),
    )
    for hdf5_path, npy_path in conversions:
        save_data_as_hdf5(hdf5_path, npy_path)

if __name__ == '__main__':
    main()

是不是感觉非常简单呀～

OK, See You Next Chapter!

发表评论 取消回复

发表评论取消回复