diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index 8e2637b0658..7ccb7a6e805 100644 --- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -4,7 +4,6 @@ #include #include #include - #include "boost/scoped_ptr.hpp" #include "hdf5.h" #include "leveldb/db.h" @@ -17,6 +16,10 @@ #include "caffe/internal_thread.hpp" #include "caffe/layer.hpp" #include "caffe/proto/caffe.pb.h" +#include +#include +#include +#include namespace caffe { @@ -272,6 +275,44 @@ class ImageDataLayer : public BasePrefetchingDataLayer { int lines_id_; }; + +/** + * @brief Provides data to the Net from video files. + * + * TODO(dox): thorough documentation for Forward and proto params. + */ +template +class VideoDataLayer : public BasePrefetchingDataLayer { + public: + explicit VideoDataLayer(const LayerParameter& param) + : BasePrefetchingDataLayer(param) {} + virtual ~VideoDataLayer(); + virtual void DataLayerSetUp(const vector*>& bottom, + vector*>* top); + + virtual inline LayerParameter_LayerType type() const { + return LayerParameter_LayerType_VIDEO_DATA; + } + virtual inline int ExactNumBottomBlobs() const { return 0; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + protected: + shared_ptr prefetch_rng_; + virtual void ShuffleBatches(); + virtual void InternalThreadEntry(); + bool ReadVideoFrameToDatum(const string& filename, size_t id, + size_t persp, const int height, const int width, Datum* datum); + +//inline bool ReadVideoBatchToDatum(const string& filename, std::vector frameIds, +// std::vectortrans, Datum* datum) { + // return ReadVideoBatchToDatum(filename, frameIds, trans, 0, 0, datum); +//} + + vector, std::vector > > > lines_; + int lines_id_; + cv::VideoCapture* cap; +}; + /** * @brief Provides data to the Net from memory. * diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp index 7edd1a4149c..a27f2528a0f 100644 --- a/include/caffe/util/io.hpp +++ b/include/caffe/util/io.hpp @@ -105,6 +105,9 @@ inline bool ReadImageToDatum(const string& filename, const int label, return ReadImageToDatum(filename, label, 0, 0, datum); } + + + leveldb::Options GetLevelDBOptions(); template diff --git a/models/brody/train_val_brody.prototxt b/models/brody/train_val_brody.prototxt index 23c08d1206f..b70ea376e0e 100644 --- a/models/brody/train_val_brody.prototxt +++ b/models/brody/train_val_brody.prototxt @@ -3,12 +3,11 @@ name: "BrodyNet" # Training input. layers { name: "data" - type: DATA + type: VIDEO_DATA top: "data" - data_param { - source: "/deep/group/driving_data/twangcat/lmdb/driving_img_train" - backend: LMDB - batch_size: 5 + video_data_param { + source: "/scail/group/deeplearning/driving_data/twangcat/schedules/q50_multilane_planar_train_schedule1_batch20_2cam.txt" + batch_size: 20 } transform_param { mean_file: "driving_img_mean.binaryproto" @@ -24,7 +23,7 @@ layers { data_param { source: "/deep/group/driving_data/twangcat/lmdb/driving_label_train" backend: LMDB - batch_size: 5 + batch_size: 20 } include: { phase: TRAIN } } @@ -37,7 +36,7 @@ layers { data_param { source: "/deep/group/driving_data/twangcat/lmdb/driving_img_test" backend: LMDB - batch_size: 5 + batch_size: 20 } transform_param { mean_file: "driving_img_mean.binaryproto" @@ -53,7 +52,7 @@ layers { data_param { source: "/deep/group/driving_data/twangcat/lmdb/driving_label_test" backend: LMDB - batch_size: 5 + batch_size: 20 } include: { phase: TEST } } diff --git a/src/caffe/.__afs6319 b/src/caffe/.__afs6319 index 2365e5d4e5d..65eafff2724 100644 Binary files a/src/caffe/.__afs6319 and b/src/caffe/.__afs6319 differ diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp index 7150fd99c18..452e4c4a31a 100644 --- a/src/caffe/data_transformer.cpp +++ b/src/caffe/data_transformer.cpp @@ -108,4 +108,4 @@ unsigned int DataTransformer::Rand() { INSTANTIATE_CLASS(DataTransformer); -} // namespace caffe +} // namespace caffe \ No newline at end of file diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index b78167f21eb..6002bd07f0e 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ -1,5 +1,4 @@ #include - #include "caffe/layer.hpp" #include "caffe/proto/caffe.pb.h" #include "caffe/vision_layers.hpp" @@ -249,6 +248,8 @@ Layer* GetLayer(const LayerParameter& param) { return new SplitLayer(param); case LayerParameter_LayerType_TANH: return GetTanHLayer(name, param); + case LayerParameter_LayerType_VIDEO_DATA: + return new VideoDataLayer(param); case LayerParameter_LayerType_WINDOW_DATA: return new WindowDataLayer(param); case LayerParameter_LayerType_NONE: diff --git a/src/caffe/layers/video_data_layer.cpp b/src/caffe/layers/video_data_layer.cpp new file mode 100644 index 00000000000..ac814fbbe0f --- /dev/null +++ b/src/caffe/layers/video_data_layer.cpp @@ -0,0 +1,232 @@ +#include // NOLINT(readability/streams) +#include // NOLINT(readability/streams) +#include +#include +#include +#include +#include "caffe/data_layers.hpp" +#include "caffe/layer.hpp" +#include "caffe/util/io.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/util/rng.hpp" +#include +namespace caffe { + +template +VideoDataLayer::~VideoDataLayer() { + this->JoinPrefetchThread(); +} + +template +bool VideoDataLayer:: ReadVideoFrameToDatum(const string& filename, size_t id, size_t persp, + const int height, const int width, Datum* datum) { + int cam_num = (int)(filename.at(filename.length()-5) - '0'); + cam_num = cam_num>2?2:cam_num; // 3rd cam is for testing only. So using cam2 distortions as dummy + //int numPersp = mTransforms.size()/2; + cv::Mat cv_img, cv_img_origin; + bool set_ok = this->cap->set(CV_CAP_PROP_POS_FRAMES, id ); + if(!set_ok) { + LOG(ERROR)<<"Failed to set video frame"; + return false; + } + bool read_ok = this->cap->read(cv_img_origin); + if(!read_ok) { + LOG(ERROR)<<"Failed to read video frame"; + return false; + } + // resize image if necessary + if (height > 0 && width > 0) { + cv::resize(cv_img_origin, cv_img, cv::Size(width, height)); + } else { + cv_img = cv_img_origin; + } + // apply perspective transform + //cv::Mat warpMatrix = mTransforms[persp+(cam_num-1)*numPersp]; + //cv::warpPerspective(cv_img, cv_img, warpMatrix, frame.size(), cv::INTER_LINEAR, cv::BORDER_REPLICATE); + // copy data to datum + int num_channels = 3; + datum->set_channels(num_channels); + datum->set_height(cv_img.rows); + datum->set_width(cv_img.cols); + datum->set_label(0); // dummy label for now. + datum->clear_data(); + datum->clear_float_data(); + string* datum_string = datum->mutable_data(); + for (int c = 0; c < num_channels; ++c) { + for (int h = 0; h < cv_img.rows; ++h) { + for (int w = 0; w < cv_img.cols; ++w) { + datum_string->push_back( + static_cast(cv_img.at(h, w)[c])); + } + } + } + return true; +} + +template +void VideoDataLayer::DataLayerSetUp(const vector*>& bottom, + vector*>* top) { + const int new_height = this->layer_param_.video_data_param().new_height(); + const int new_width = this->layer_param_.video_data_param().new_width(); + CHECK((new_height == 0 && new_width == 0) || + (new_height > 0 && new_width > 0)) << "Current implementation requires " + "new_height and new_width to be set at the same time."; + // Read the file with filenames and labels + const string& source = this->layer_param_.video_data_param().source(); + LOG(INFO) << "Opening schedule file " << source; + std::ifstream infile(source.c_str()); + + string batch_string; + + string filename; + //while (infile >> batch_string) { + while (getline (infile, batch_string)) { + if(!infile) + { + if(infile.eof()) + { + LOG(INFO) << "Reached EOF of schedule file."; + break; + } + else + LOG(FATAL)<< "Error while reading schedule file. Possibly corrupted."; + } + std::vector batch_fields; + // first split a line into fields with delimiter ",". Fields should be [filename, frame_ids, transform_ids] + boost::split(batch_fields, batch_string, boost::is_any_of(","), + boost::token_compress_on); + if(batch_fields.size()!=3) + LOG(FATAL) << "Each line must have 3 fields separated by comma, " + < frame_ids_str; + std::vector frame_ids; + boost::split(frame_ids_str, batch_fields[1], boost::is_any_of(" "), + boost::token_compress_on); + for (int f=0; f trans_ids_str; + std::vector trans_ids; + boost::split(trans_ids_str, batch_fields[2], boost::is_any_of(" "), + boost::token_compress_on); + for (int f=0; flayer_param_.video_data_param().shuffle()) { + // randomly shuffle data + LOG(INFO) << "Shuffling batches"; + const unsigned int prefetch_rng_seed = caffe_rng_rand(); + prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); + ShuffleBatches(); + } + LOG(INFO) << "A total of " << lines_.size() << " batches."; + + lines_id_ = 0; + // Check if we would need to randomly skip a few data points + if (this->layer_param_.video_data_param().rand_skip()) { + unsigned int skip = caffe_rng_rand() % + this->layer_param_.video_data_param().rand_skip(); + LOG(INFO) << "Skipping first " << skip << " data points."; + CHECK_GT(lines_.size(), skip) << "Not enough points to skip"; + lines_id_ = skip; + } + // Read a data batch, and use it to initialize the top blob. + this->cap = new cv::VideoCapture(lines_[lines_id_].first); + Datum datum; + CHECK(ReadVideoFrameToDatum(lines_[lines_id_].first, lines_[lines_id_].second.first[0], + lines_[lines_id_].second.second[0], new_height, new_width, &datum)); + this->cap->release(); + // image + const int crop_size = this->layer_param_.transform_param().crop_size(); + const int batch_size = this->layer_param_.video_data_param().batch_size(); + if (crop_size > 0) { + (*top)[0]->Reshape(batch_size, datum.channels(), crop_size, crop_size); + this->prefetch_data_.Reshape(batch_size, datum.channels(), crop_size, + crop_size); + } else { + (*top)[0]->Reshape(batch_size, datum.channels(), datum.height(), + datum.width()); + this->prefetch_data_.Reshape(batch_size, datum.channels(), datum.height(), + datum.width()); + } + LOG(INFO) << "output data size: " << (*top)[0]->num() << "," + << (*top)[0]->channels() << "," << (*top)[0]->height() << "," + << (*top)[0]->width(); + // label + //(*top)[1]->Reshape(batch_size, 1, 1, 1); + this->prefetch_label_.Reshape(batch_size, 1, 1, 1); + // datum size + this->datum_channels_ = datum.channels(); + this->datum_height_ = datum.height(); + this->datum_width_ = datum.width(); + this->datum_size_ = datum.channels() * datum.height() * datum.width(); +} + +template +void VideoDataLayer::ShuffleBatches() { + caffe::rng_t* prefetch_rng = + static_cast(prefetch_rng_->generator()); + shuffle(lines_.begin(), lines_.end(), prefetch_rng); +} + + + +// This function is used to create a thread that prefetches the data. +template +void VideoDataLayer::InternalThreadEntry() { + Datum datum; + CHECK(this->prefetch_data_.count()); + Dtype* top_data = this->prefetch_data_.mutable_cpu_data(); + //Dtype* top_label = this->prefetch_label_.mutable_cpu_data(); + VideoDataParameter video_data_param = this->layer_param_.video_data_param(); + const int batch_size = video_data_param.batch_size(); + const int new_height = video_data_param.new_height(); + const int new_width = video_data_param.new_width(); + + // datum scales + const int lines_size = lines_.size(); + string filename = lines_[lines_id_].first; + std::vector frameIds = lines_[lines_id_].second.first; + std::vector trans = lines_[lines_id_].second.second; + if (batch_size!=frameIds.size() || batch_size!=trans.size()) + LOG(ERROR)<<"Frame count mismatch!"; + LOG(INFO)<<"reading video file "<cap = new cv::VideoCapture(filename); + for (int item_id = 0; item_id < batch_size; ++item_id) { + // get a blob + //CHECK_GT(lines_size, lines_id_); + if (!ReadVideoFrameToDatum(filename, frameIds[item_id], trans[item_id], + new_height, new_width, &datum)) { + LOG(ERROR)<< "Error reading frame from video!"; + continue; + } + + // Apply transformations (mirror, crop...) to the data + this->data_transformer_.Transform(item_id, datum, this->mean_, top_data); + + // go to the next iter + lines_id_++; + if (lines_id_ >= lines_size) { + // We have reached the end. Restart from the first. + DLOG(INFO) << "Restarting data prefetching from start."; + lines_id_ = 0; + if (this->layer_param_.image_data_param().shuffle()) { + ShuffleBatches(); + } + } + } + this->cap->release(); +} + +INSTANTIATE_CLASS(VideoDataLayer); + +} // namespace caffe diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 9395c38f3e9..0e69ef54e77 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -219,7 +219,7 @@ message LayerParameter { // line above the enum. Update the next available ID when you add a new // LayerType. // - // LayerType next available ID: 38 (last added: CONTRASTIVE_LOSS) + // LayerType next available ID: 39 (last added: VIDEO_DATA) enum LayerType { // "NONE" layer type is 0th enum element so that we don't cause confusion // by defaulting to an existent LayerType (instead, should usually error if @@ -260,8 +260,9 @@ message LayerParameter { SPLIT = 22; SLICE = 33; TANH = 23; - WINDOW_DATA = 24; THRESHOLD = 31; + VIDEO_DATA=38; + WINDOW_DATA = 24; } optional LayerType type = 5; // the layer type from the enum above @@ -316,6 +317,7 @@ message LayerParameter { optional SliceParameter slice_param = 31; optional TanHParameter tanh_param = 37; optional ThresholdParameter threshold_param = 25; + optional VideoDataParameter video_data_param = 41; optional WindowDataParameter window_data_param = 20; // Parameters for data pre-processing. @@ -524,6 +526,35 @@ message ImageDataParameter { optional bool mirror = 6 [default = false]; } +// Message that stores parameters used by VideoDataLayer +message VideoDataParameter { + // Specify the data source. + optional string source = 1; + // Specify the batch size. + optional uint32 batch_size = 4; + // The rand_skip variable is for the data layer to skip a few data points + // to avoid all asynchronous sgd clients to start at the same point. The skip + // point would be set as rand_skip * rand(0,1). Note that rand_skip should not + // be larger than the number of keys in the leveldb. + optional uint32 rand_skip = 7 [default = 0]; + // Whether or not VideoDataLayer should shuffle the list of files at every epoch. + optional bool shuffle = 8 [default = false]; + // It will also resize images if new_height or new_width are not zero. + optional uint32 new_height = 9 [default = 0]; + optional uint32 new_width = 10 [default = 0]; + // DEPRECATED. See TransformationParameter. For data pre-processing, we can do + // simple scaling and subtracting the data mean, if provided. Note that the + // mean subtraction is always carried out before scaling. + optional float scale = 2 [default = 1]; + optional string mean_file = 3; + // DEPRECATED. See TransformationParameter. Specify if we would like to randomly + // crop an image. + optional uint32 crop_size = 5 [default = 0]; + // DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror + // data. + optional bool mirror = 6 [default = false]; +} + // Message that stores parameters InfogainLossLayer message InfogainLossParameter { // Specify the infogain matrix source. diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 26bab64b2ca..aa1dd9e137e 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -207,52 +207,89 @@ void Solver::Solve(const char* resume_file) { string str2("label"); string str3("pixel-label"); string str4("bb-label"); + string save_dir("/scr/twangcat/caffenet_results/train/"); + vector save_imgs; + int quad_height; + int quad_width; + int batch_size; + const Dtype* pix_start; + const Dtype* bb_start; for (int j = 0; j < blobs.size(); ++j) { - if(blob_names[j].compare(str1)==0) + if(blob_names[j].compare(str3)==0) //pixel label + { + LOG(INFO) << "pixel-label " << blobs[j]->num()<<" "<channels()<<" "<height()<<" "<width(); + pix_start = blobs[j]->cpu_data(); + quad_height = blobs[j]->height(); + quad_width = blobs[j]->width(); + batch_size = blobs[j]->num(); + } + if(blob_names[j].compare(str4)==0) // bb label + { + LOG(INFO) << "bb-label " << blobs[j]->num()<<" "<channels()<<" "<height()<<" "<width(); + bb_start = blobs[j]->cpu_data(); + } + if(blob_names[j].compare(str1)==0) // actual image { LOG(INFO) << "data " << blobs[j]->num()<<" "<channels()<<" "<height()<<" "<width(); + const Dtype* data_start = blobs[j]->cpu_data(); for(int n=0; nnum(); ++n) { - int image_id = this->iter_*5+n; - const Dtype* foo; - foo = blobs[j]->cpu_data()+n*blobs[j]->channels()*blobs[j]->height()*blobs[j]->width(); cv::Mat curr_img = cv::Mat(blobs[j]->height(), blobs[j]->width(), CV_32FC3, cv::Scalar(0,0,255)); - double minVal, maxVal; - cv::minMaxLoc(curr_img, &minVal, &maxVal); //find minimum and maximum intensities - //std::copy ( foo, foo+blobs[j]->channels()*blobs[j]->height()*blobs[j]->width(), curr_img.data ); for(int kk=0; kkchannels();++kk) { for(int yy=0; yyheight();++yy) { for(int xx=0; xxwidth();++xx) { - //std::cout<<*(foo+(((n*blobs[j]->channels() + kk) * blobs[j]->height() + yy) * blobs[j]->width() + xx))<<" "; - //*(curr_img.data+((yy * blobs[j]->width() + xx) * 3 + kk))=*(foo+(((n*blobs[j]->channels() + kk) * blobs[j]->height() + yy) * blobs[j]->width() + xx)); - curr_img.at(yy,xx)[kk]=*(foo+(((n*blobs[j]->channels() + kk) * blobs[j]->height() + yy) * blobs[j]->width() + xx)); + curr_img.at(yy,xx)[kk]=*(data_start+(((n*blobs[j]->channels() + kk) * blobs[j]->height() + yy) * blobs[j]->width() + xx)); } } } - //std::cout<num()<<" "<channels()<<" "<height()<<" "<width(); - } - if(blob_names[j].compare(str3)==0) - { - LOG(INFO) << "pixel-label " << blobs[j]->num()<<" "<channels()<<" "<height()<<" "<width(); + } + int grid_dim=4; + int label_count = 0; + int label_height = quad_height*grid_dim; + int label_width = quad_width*grid_dim; + Dtype scaling = 1.0/8; + for(int n=0; niter_*5+n; + cv::Mat save_img = save_imgs[n]; + std::ostringstream stringStream; + stringStream <(y/scaling,x/scaling) = cv::Vec3f(0,255,0); + save_img.at(y/scaling-1,x/scaling-1) = cv::Vec3f(0,255,0); + save_img.at(y/scaling+1,x/scaling-1) = cv::Vec3f(0,255,0); + save_img.at(y/scaling-1,x/scaling+1) = cv::Vec3f(0,255,0); + save_img.at(y/scaling+1,x/scaling+1) = cv::Vec3f(0,255,0); + + float x_adj = (qx*grid_dim + grid_dim / 2) / scaling; + float y_adj = (qy*grid_dim + grid_dim / 2) / scaling; + int x_min = *(bb_start+(((n*64+z)*quad_height+qy)*quad_width+qx))+x_adj; + int y_min = *(bb_start+(((n*64+z+16)*quad_height+qy)*quad_width+qx))+y_adj; + int x_max = *(bb_start+(((n*64+z+32)*quad_height+qy)*quad_width+qx))+x_adj; + int y_max = *(bb_start+(((n*64+z+48)*quad_height+qy)*quad_width+qx))+y_adj; + cv::Rect bb(x_min, y_min, x_max-x_min+1, y_max-y_min+1); + cv::rectangle(save_img, bb, cv::Scalar(100, 100, 200), 2); + } + } + } + cv::imwrite(save_name, save_img); } - if(blob_names[j].compare(str4)==0) - { - LOG(INFO) << "bb-label " << blobs[j]->num()<<" "<channels()<<" "<height()<<" "<width(); - }*/ } //end int score_index = 0; @@ -321,6 +358,7 @@ void Solver::Test(const int test_net_id) { const shared_ptr >& test_net = test_nets_[test_net_id]; Dtype loss = 0; for (int i = 0; i < param_.test_iter(test_net_id); ++i) { + LOG(INFO) << "i = " << i<<" of "<*>& result = test_net->Forward(bottom_vec, &iter_loss); @@ -350,6 +388,7 @@ void Solver::Test(const int test_net_id) { LOG(INFO) << "Test loss: " << loss; } for (int i = 0; i < test_score.size(); ++i) { + LOG(INFO) << "i2 = " << i<<" of "<output_blob_indices()[test_score_output_id[i]]; const string& output_name = test_net->blob_names()[output_blob_index]; diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp index cf4ba8cffa4..991ef91491d 100644 --- a/src/caffe/util/io.cpp +++ b/src/caffe/util/io.cpp @@ -166,6 +166,16 @@ bool ReadImageToDatum(const string& filename, const int label, return true; } +//added by Tao + + + + + +//end + + + leveldb::Options GetLevelDBOptions() { // In default, we will return the leveldb option and set the max open files // in order to avoid using up the operating system's limit.