added video data loader. Now need to add lane label reader
Tao Wang committed Nov 15, 2014
1 parent 66c77d5 commit ff97eab
Showing 10 changed files with 397 additions and 41 deletions.
43 changes: 42 additions & 1 deletion include/caffe/data_layers.hpp
@@ -4,7 +4,6 @@
#include <string>
#include <utility>
#include <vector>

#include "boost/scoped_ptr.hpp"
#include "hdf5.h"
#include "leveldb/db.h"
@@ -17,6 +16,10 @@
#include "caffe/internal_thread.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/highgui/highgui_c.h>
#include <opencv2/imgproc/imgproc.hpp>

namespace caffe {

@@ -272,6 +275,44 @@ class ImageDataLayer : public BasePrefetchingDataLayer<Dtype> {
int lines_id_;
};


/**
* @brief Provides data to the Net from video files.
*
* TODO(dox): thorough documentation for Forward and proto params.
*/
template <typename Dtype>
class VideoDataLayer : public BasePrefetchingDataLayer<Dtype> {
public:
explicit VideoDataLayer(const LayerParameter& param)
: BasePrefetchingDataLayer<Dtype>(param) {}
virtual ~VideoDataLayer();
virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top);

virtual inline LayerParameter_LayerType type() const {
return LayerParameter_LayerType_VIDEO_DATA;
}
virtual inline int ExactNumBottomBlobs() const { return 0; }
virtual inline int ExactNumTopBlobs() const { return 1; }

protected:
shared_ptr<Caffe::RNG> prefetch_rng_;
virtual void ShuffleBatches();
virtual void InternalThreadEntry();
bool ReadVideoFrameToDatum(const string& filename, size_t id,
size_t persp, const int height, const int width, Datum* datum);

//inline bool ReadVideoBatchToDatum(const string& filename, std::vector<size_t> frameIds,
// std::vector<size_t>trans, Datum* datum) {
// return ReadVideoBatchToDatum(filename, frameIds, trans, 0, 0, datum);
//}

vector<std::pair<std::string, std::pair<std::vector<size_t>, std::vector<size_t> > > > lines_;
int lines_id_;
cv::VideoCapture* cap;
};

/**
* @brief Provides data to the Net from memory.
*
3 changes: 3 additions & 0 deletions include/caffe/util/io.hpp
@@ -105,6 +105,9 @@ inline bool ReadImageToDatum(const string& filename, const int label,
return ReadImageToDatum(filename, label, 0, 0, datum);
}




leveldb::Options GetLevelDBOptions();

template <typename Dtype>
15 changes: 7 additions & 8 deletions models/brody/train_val_brody.prototxt
@@ -3,12 +3,11 @@ name: "BrodyNet"
# Training input.
layers {
name: "data"
type: DATA
type: VIDEO_DATA
top: "data"
data_param {
source: "/deep/group/driving_data/twangcat/lmdb/driving_img_train"
backend: LMDB
batch_size: 5
video_data_param {
source: "/scail/group/deeplearning/driving_data/twangcat/schedules/q50_multilane_planar_train_schedule1_batch20_2cam.txt"
batch_size: 20
}
transform_param {
mean_file: "driving_img_mean.binaryproto"
@@ -24,7 +23,7 @@ layers {
data_param {
source: "/deep/group/driving_data/twangcat/lmdb/driving_label_train"
backend: LMDB
batch_size: 5
batch_size: 20
}
include: { phase: TRAIN }
}
@@ -37,7 +36,7 @@ layers {
data_param {
source: "/deep/group/driving_data/twangcat/lmdb/driving_img_test"
backend: LMDB
batch_size: 5
batch_size: 20
}
transform_param {
mean_file: "driving_img_mean.binaryproto"
@@ -53,7 +52,7 @@ layers {
data_param {
source: "/deep/group/driving_data/twangcat/lmdb/driving_label_test"
backend: LMDB
batch_size: 5
batch_size: 20
}
include: { phase: TEST }
}
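For reference, the schedule file named in video_data_param above is parsed by the new VideoDataLayer (video_data_layer.cpp below): each line carries three comma-separated fields, a video filename, space-separated frame ids, and space-separated perspective-transform ids, and the number of ids on a line must match batch_size. A hypothetical line (illustrative path and ids only, shortened to four frames) would look like:

    /path/to/driving_run_split_2.avi,100 110 120 130,0 1 2 3

Note that ReadVideoFrameToDatum takes the camera number from the character five positions before the end of the filename, so video names are expected to end in the camera index followed by a four-character extension such as ".avi".
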
Binary file modified src/caffe/.__afs6319
2 changes: 1 addition & 1 deletion src/caffe/data_transformer.cpp
@@ -108,4 +108,4 @@ unsigned int DataTransformer<Dtype>::Rand() {

INSTANTIATE_CLASS(DataTransformer);

} // namespace caffe
} // namespace caffe
3 changes: 2 additions & 1 deletion src/caffe/layer_factory.cpp
@@ -1,5 +1,4 @@
#include <string>

#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"
#include "caffe/vision_layers.hpp"
@@ -249,6 +248,8 @@ Layer<Dtype>* GetLayer(const LayerParameter& param) {
return new SplitLayer<Dtype>(param);
case LayerParameter_LayerType_TANH:
return GetTanHLayer<Dtype>(name, param);
case LayerParameter_LayerType_VIDEO_DATA:
return new VideoDataLayer<Dtype>(param);
case LayerParameter_LayerType_WINDOW_DATA:
return new WindowDataLayer<Dtype>(param);
case LayerParameter_LayerType_NONE:
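The new factory case means a net whose prototxt declares type: VIDEO_DATA (as in train_val_brody.prototxt above) is handed a VideoDataLayer when the net is constructed. A minimal sketch of that dispatch, assuming the generated proto setters and the GetLayer declaration are in scope (this is not code from the commit):

    caffe::LayerParameter param;
    param.set_name("data");
    param.set_type(caffe::LayerParameter_LayerType_VIDEO_DATA);
    // GetLayer returns a newly allocated VideoDataLayer<float> for this type.
    caffe::Layer<float>* layer = caffe::GetLayer<float>(param);
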
232 changes: 232 additions & 0 deletions src/caffe/layers/video_data_layer.cpp
@@ -0,0 +1,232 @@
#include <fstream> // NOLINT(readability/streams)
#include <iostream> // NOLINT(readability/streams)
#include <string>
#include <utility>
#include <vector>
#include <stdlib.h>
#include "caffe/data_layers.hpp"
#include "caffe/layer.hpp"
#include "caffe/util/io.hpp"
#include "caffe/util/math_functions.hpp"
#include "caffe/util/rng.hpp"
#include <boost/algorithm/string.hpp>
namespace caffe {

template <typename Dtype>
VideoDataLayer<Dtype>::~VideoDataLayer<Dtype>() {
this->JoinPrefetchThread();
}

template <typename Dtype>
bool VideoDataLayer<Dtype>::ReadVideoFrameToDatum(const string& filename, size_t id, size_t persp,
const int height, const int width, Datum* datum) {
// The camera index is encoded as the character five positions before the end of the filename.
int cam_num = (int)(filename.at(filename.length()-5) - '0');
cam_num = cam_num > 2 ? 2 : cam_num; // The 3rd cam is for testing only, so use cam2 distortions as a dummy.
//int numPersp = mTransforms.size()/2;
cv::Mat cv_img, cv_img_origin;
bool set_ok = this->cap->set(CV_CAP_PROP_POS_FRAMES, id );
if(!set_ok) {
LOG(ERROR)<<"Failed to set video frame";
return false;
}
bool read_ok = this->cap->read(cv_img_origin);
if(!read_ok) {
LOG(ERROR)<<"Failed to read video frame";
return false;
}
// resize image if necessary
if (height > 0 && width > 0) {
cv::resize(cv_img_origin, cv_img, cv::Size(width, height));
} else {
cv_img = cv_img_origin;
}
// apply perspective transform
//cv::Mat warpMatrix = mTransforms[persp+(cam_num-1)*numPersp];
//cv::warpPerspective(cv_img, cv_img, warpMatrix, frame.size(), cv::INTER_LINEAR, cv::BORDER_REPLICATE);
// copy data to datum
int num_channels = 3;
datum->set_channels(num_channels);
datum->set_height(cv_img.rows);
datum->set_width(cv_img.cols);
datum->set_label(0); // dummy label for now.
datum->clear_data();
datum->clear_float_data();
string* datum_string = datum->mutable_data();
for (int c = 0; c < num_channels; ++c) {
for (int h = 0; h < cv_img.rows; ++h) {
for (int w = 0; w < cv_img.cols; ++w) {
datum_string->push_back(
static_cast<char>(cv_img.at<cv::Vec3b>(h, w)[c]));
}
}
}
return true;
}

template <typename Dtype>
void VideoDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top) {
const int new_height = this->layer_param_.video_data_param().new_height();
const int new_width = this->layer_param_.video_data_param().new_width();
CHECK((new_height == 0 && new_width == 0) ||
(new_height > 0 && new_width > 0)) << "Current implementation requires "
"new_height and new_width to be set at the same time.";
// Read the file with filenames and labels
const string& source = this->layer_param_.video_data_param().source();
LOG(INFO) << "Opening schedule file " << source;
std::ifstream infile(source.c_str());

string batch_string;

string filename;
//while (infile >> batch_string) {
while (getline (infile, batch_string)) {
if(!infile)
{
if(infile.eof())
{
LOG(INFO) << "Reached EOF of schedule file.";
break;
}
else
LOG(FATAL)<< "Error while reading schedule file. Possibly corrupted.";
}
std::vector<string> batch_fields;
// first split a line into fields with delimiter ",". Fields should be [filename, frame_ids, transform_ids]
boost::split(batch_fields, batch_string, boost::is_any_of(","),
boost::token_compress_on);
if (batch_fields.size() != 3)
LOG(FATAL) << "Each line must have 3 comma-separated fields, but "
<< batch_fields.size() << " were found.";
// store filename
filename = batch_fields[0];
// store frame ids
std::vector<string> frame_ids_str;
std::vector<size_t> frame_ids;
boost::split(frame_ids_str, batch_fields[1], boost::is_any_of(" "),
boost::token_compress_on);
for (int f=0; f<frame_ids_str.size(); ++f) {
frame_ids.push_back((size_t)atoi(frame_ids_str[f].c_str()));
}

// store persp transform ids
std::vector<string> trans_ids_str;
std::vector<size_t> trans_ids;
boost::split(trans_ids_str, batch_fields[2], boost::is_any_of(" "),
boost::token_compress_on);
for (int f=0; f<trans_ids_str.size(); ++f) {
trans_ids.push_back((size_t)atoi(trans_ids_str[f].c_str()));
}

lines_.push_back(std::make_pair(filename, std::make_pair(frame_ids, trans_ids)));
}

if (this->layer_param_.video_data_param().shuffle()) {
// randomly shuffle data
LOG(INFO) << "Shuffling batches";
const unsigned int prefetch_rng_seed = caffe_rng_rand();
prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed));
ShuffleBatches();
}
LOG(INFO) << "A total of " << lines_.size() << " batches.";

lines_id_ = 0;
// Check if we would need to randomly skip a few data points
if (this->layer_param_.video_data_param().rand_skip()) {
unsigned int skip = caffe_rng_rand() %
this->layer_param_.video_data_param().rand_skip();
LOG(INFO) << "Skipping first " << skip << " data points.";
CHECK_GT(lines_.size(), skip) << "Not enough points to skip";
lines_id_ = skip;
}
// Read a data batch, and use it to initialize the top blob.
this->cap = new cv::VideoCapture(lines_[lines_id_].first);
Datum datum;
CHECK(ReadVideoFrameToDatum(lines_[lines_id_].first, lines_[lines_id_].second.first[0],
lines_[lines_id_].second.second[0], new_height, new_width, &datum));
this->cap->release();
// image
const int crop_size = this->layer_param_.transform_param().crop_size();
const int batch_size = this->layer_param_.video_data_param().batch_size();
if (crop_size > 0) {
(*top)[0]->Reshape(batch_size, datum.channels(), crop_size, crop_size);
this->prefetch_data_.Reshape(batch_size, datum.channels(), crop_size,
crop_size);
} else {
(*top)[0]->Reshape(batch_size, datum.channels(), datum.height(),
datum.width());
this->prefetch_data_.Reshape(batch_size, datum.channels(), datum.height(),
datum.width());
}
LOG(INFO) << "output data size: " << (*top)[0]->num() << ","
<< (*top)[0]->channels() << "," << (*top)[0]->height() << ","
<< (*top)[0]->width();
// label
//(*top)[1]->Reshape(batch_size, 1, 1, 1);
this->prefetch_label_.Reshape(batch_size, 1, 1, 1);
// datum size
this->datum_channels_ = datum.channels();
this->datum_height_ = datum.height();
this->datum_width_ = datum.width();
this->datum_size_ = datum.channels() * datum.height() * datum.width();
}

template <typename Dtype>
void VideoDataLayer<Dtype>::ShuffleBatches() {
caffe::rng_t* prefetch_rng =
static_cast<caffe::rng_t*>(prefetch_rng_->generator());
shuffle(lines_.begin(), lines_.end(), prefetch_rng);
}



// This function is used to create a thread that prefetches the data.
template <typename Dtype>
void VideoDataLayer<Dtype>::InternalThreadEntry() {
Datum datum;
CHECK(this->prefetch_data_.count());
Dtype* top_data = this->prefetch_data_.mutable_cpu_data();
//Dtype* top_label = this->prefetch_label_.mutable_cpu_data();
VideoDataParameter video_data_param = this->layer_param_.video_data_param();
const int batch_size = video_data_param.batch_size();
const int new_height = video_data_param.new_height();
const int new_width = video_data_param.new_width();

// datum scales
const int lines_size = lines_.size();
string filename = lines_[lines_id_].first;
std::vector<size_t> frameIds = lines_[lines_id_].second.first;
std::vector<size_t> trans = lines_[lines_id_].second.second;
if (batch_size!=frameIds.size() || batch_size!=trans.size())
LOG(ERROR)<<"Frame count mismatch!";
LOG(INFO)<<"reading video file "<<filename;
this->cap = new cv::VideoCapture(filename);
for (int item_id = 0; item_id < batch_size; ++item_id) {
// get a blob
//CHECK_GT(lines_size, lines_id_);
if (!ReadVideoFrameToDatum(filename, frameIds[item_id], trans[item_id],
new_height, new_width, &datum)) {
LOG(ERROR)<< "Error reading frame from video!";
continue;
}

// Apply transformations (mirror, crop...) to the data
this->data_transformer_.Transform(item_id, datum, this->mean_, top_data);

// go to the next iter
lines_id_++;
if (lines_id_ >= lines_size) {
// We have reached the end. Restart from the first.
DLOG(INFO) << "Restarting data prefetching from start.";
lines_id_ = 0;
if (this->layer_param_.video_data_param().shuffle()) {
ShuffleBatches();
}
}
}
this->cap->release();
}

INSTANTIATE_CLASS(VideoDataLayer);

} // namespace caffe
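
One detail worth calling out in ReadVideoFrameToDatum above: the triple loop copies the frame channel-major (all of channel 0, then channel 1, then channel 2), which is the single-image C x H x W layout Caffe blobs use, and since cv::Mat stores pixels as BGR, channel 0 of the datum is blue. A minimal sketch of the resulting byte offset (the helper name here is ours, not part of the commit):

    // Offset of pixel (c, h, w) in the datum string written by the
    // channel-major loop in ReadVideoFrameToDatum.
    inline size_t datum_offset(int c, int h, int w, int height, int width) {
      return (static_cast<size_t>(c) * height + h) * width + w;
    }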
