From 0f868aa2af5d8353adffe5ea2cba25c590c16336 Mon Sep 17 00:00:00 2001
From: brodyh
Date: Sun, 15 Feb 2015 19:13:41 -0800
Subject: [PATCH] Added new ipython notebook

---
 ...er_visualization_new_driving_softmax.ipynb | 592 ++++++++++++++++++
 1 file changed, 592 insertions(+)
 create mode 100644 examples/filter_visualization_new_driving_softmax.ipynb

diff --git a/examples/filter_visualization_new_driving_softmax.ipynb b/examples/filter_visualization_new_driving_softmax.ipynb
new file mode 100644
index 00000000000..5f29ea45db6
--- /dev/null
+++ b/examples/filter_visualization_new_driving_softmax.ipynb
@@ -0,0 +1,592 @@
+{
+ "metadata": {
+ "description": "Extracting features and visualizing trained filters with an example image, viewed layer-by-layer.",
+ "example_name": "Filter visualization",
+ "include_in_docs": true,
+ "priority": 2,
+ "signature": "sha256:f891223d1e1552aa8c7362486254a27a98c554037008aeca1ca510db50a90ea7"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here we visualize the filters and layer outputs of the new driving softmax detection network, a Krizhevsky-style convolutional architecture trained on driving data and implemented in `caffe`.\n",
+ "\n",
+ "(This page follows the DeCAF visualizations originally by Yangqing Jia.)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "First, import the required modules and set the plotting parameters."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "import numpy as np\n",
+ "import scipy\n",
+ "from matplotlib import cm as cm\n",
+ "import matplotlib.pyplot as plt\n",
+ "import cv2\n",
+ "%matplotlib inline\n",
+ "\n",
+ "# Make sure that caffe is on the python path:\n",
+ "caffe_root = '../' # this file is expected to be in {caffe_root}/examples\n",
+ "import sys\n",
+ "sys.path.insert(0, caffe_root + 'python')\n",
+ "\n",
+ "import caffe\n",
+ "\n",
+ "plt.rcParams['figure.figsize'] = (10, 10)\n",
+ "plt.rcParams['image.interpolation'] = 'nearest'\n",
+ "plt.rcParams['image.cmap'] = 'jet'"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Load the trained driving network, specify the test phase and GPU mode, and configure input preprocessing (mean subtraction and raw scaling)."
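+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The loading cell below uses absolute paths that are specific to the machine this notebook was written on. As a hedged sketch of the same setup with hypothetical placeholder filenames (not files shipped with the repository), the load-and-configure step looks like this:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "# Sketch only: 'my_deploy.prototxt', 'my_weights.caffemodel' and 'my_mean.npy' are\n",
+ "# hypothetical placeholder paths; substitute your own deploy net, weights and mean file.\n",
+ "def load_driving_net(deploy_path, weights_path, mean_path):\n",
+ "    # Same pycaffe calls as the real cell below: test phase, GPU mode, mean subtraction, raw scale.\n",
+ "    example_net = caffe.Classifier(deploy_path, weights_path)\n",
+ "    example_net.set_phase_test()\n",
+ "    example_net.set_mode_gpu()\n",
+ "    example_net.set_mean('data', np.load(mean_path))\n",
+ "    example_net.set_raw_scale('data', 255)\n",
+ "    return example_net\n",
+ "\n",
+ "# example_net = load_driving_net('my_deploy.prototxt', 'my_weights.caffemodel', 'my_mean.npy')"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now load the actual driving network (alternative snapshots are kept as commented-out paths):"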
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "net = caffe.Classifier('/deep/u/willsong/caffe/models/brody/deploy_new_driving.prototxt',\n",
+ "# '/deep/u/willsong/caffe/models/brody/driving_normalization_nl_nrob_iter_90000.caffemodel')\n",
+ "# '/deep/u/willsong/caffe/models/brody/driving_normalization_fo_iter_267000.caffemodel')\n",
+ " '/deep/u/brodyh/caffe/snapshots/drivenet/driving_normalization_nl_iter_15000.caffemodel')\n",
+ " #'/deep/u/willsong/caffe/models/brody/new_driving_softmax_iter_300000.caffemodel')\n",
+ " #'/deep/u/willsong/caffe/models/brody/driving_softmax_8x8_iter_2000.caffemodel')\n",
+ "# '/deep/u/willsong/caffe/models/brody/caffe_brody_train_gc_iter_300000.caffemodel')\n",
+ " #'/deep/u/willsong/caffe/models/brody/caffe_brody_train_iter_530000.caffemodel')\n",
+ " \n",
+ "net.set_phase_test()\n",
+ "net.set_mode_gpu()\n",
+ "\n",
+ "# input preprocessing: 'data' is the name of the input blob == net.inputs[0]\n",
+ "net.set_mean('data', np.load('/deep/u/willsong/caffe/python/driving_mean_640x480_rgb.npy')) # mean of the driving training images\n",
+ "net.set_raw_scale('data', 255) # the model operates on images in the [0,255] range instead of [0,1]\n",
+ "#net.set_channel_swap('data', (0,1,2)) # the reference model has channels in BGR order instead of RGB"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Run a forward pass on an example driving frame."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "#net.ff([caffe.io.load_image('/deep/group/driving_data/andriluka/IMAGES/driving_data_q50_data/all_extracted/4-3-14-gilroy-split_0_from_gilroy_c2/4-3-14-gilroy-split_0_from_gilroy_c2_001541.jpeg')])\n",
+ "#net.ff([caffe.io.load_image('/deep/group/driving_data/andriluka/IMAGES/driving_data_q50_data/all_extracted/4-3-14-gilroy-split_0_from_gilroy_c2/4-3-14-gilroy-split_0_from_gilroy_c2_000201.jpeg')])\n",
+ "#net.ff([caffe.io.load_image('/deep/group/driving_data/andriluka/IMAGES/driving_data_q50_data/all_extracted/4-3-14-gilroy-split_0_from_gilroy_d2/4-3-14-gilroy-split_0_from_gilroy_d2_001841.jpeg')])\n",
+ "#net.ff([caffe.io.load_image('/deep/group/driving_data/andriluka/IMAGES/driving_data_q50_data/all_extracted/4-3-14-gilroy-split_0_from_gilroy_d2/4-3-14-gilroy-split_0_from_gilroy_d2_001301.jpeg')])\n",
+ "#net.ff([caffe.io.load_image('/deep/group/driving_data/andriluka/IMAGES/driving_data_q50_data/all_extracted/5-1-14-monterey-split_0_from_castroville_i2/5-1-14-monterey-split_0_from_castroville_i2_000441.jpeg')])\n",
+ "net.ff([caffe.io.load_image('/deep/group/driving_data/andriluka/IMAGES/driving_data_q50_data/all_extracted/4-3-14-gilroy-split_0_to_gilroy_a2/4-3-14-gilroy-split_0_to_gilroy_a2_002561.jpeg')])\n",
+ "#input_img = caffe.io.load_image('/deep/group/driving_data/andriluka/IMAGES/driving_data_q50_data/all_extracted/4-3-14-gilroy-split_0_to_gilroy_a2/4-3-14-gilroy-split_0_to_gilroy_a2_002561.jpeg')\n",
+ "#print input_img[:2,:2]*255\n",
+ "#input_img_cv = cv2.imread('/deep/group/driving_data/andriluka/IMAGES/driving_data_q50_data/all_extracted/4-3-14-gilroy-split_0_to_gilroy_a2/4-3-14-gilroy-split_0_to_gilroy_a2_002561.jpeg')\n",
+ "#print input_img_cv[:2,:2]\n",
+ "#print input_img_cv.shape"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The layer features and their shapes (10 is the batch size, corresponding to the ten subcrops used by Krizhevsky et al.)"
+ ]
+ },
+ {
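+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Each blob's `data` array is laid out as `(batch, channel, height, width)`. Before listing every blob, here is a minimal sanity check on the input blob (`data`, i.e. `net.inputs[0]`); it only assumes the net loaded above:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "# Minimal sanity check of the input blob geometry (assumes the net loaded above).\n",
+ "data_shape = net.blobs['data'].data.shape\n",
+ "print(data_shape)\n",
+ "assert len(data_shape) == 4  # (batch, channel, height, width)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {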
+ "cell_type": "code", + "collapsed": false, + "input": [ + "[(k, v.data.shape) for k, v in net.blobs.items()]" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The parameters and their shapes (each of these layers also has biases which are omitted here)" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "[(k, v[0].data.shape) for k, v in net.params.items()]" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Helper functions for visualization" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# take an array of shape (n, height, width) or (n, height, width, channels)\n", + "# and visualize each (height, width) thing in a grid of size approx. sqrt(n) by sqrt(n)\n", + "def vis_square(data, padsize=1, padval=0):\n", + " data = np.copy(data)\n", + " data -= data.min()\n", + " data /= data.max()\n", + " \n", + " # force the number of filters to be square\n", + " n = int(np.ceil(np.sqrt(data.shape[0])))\n", + " padding = ((0, n ** 2 - data.shape[0]), (0, padsize), (0, padsize)) + ((0, 0),) * (data.ndim - 3)\n", + " data = np.pad(data, padding, mode='constant', constant_values=(padval, padval))\n", + " \n", + " # tile the filters into an image\n", + " data = data.reshape((n, n) + data.shape[1:]).transpose((0, 2, 1, 3) + tuple(range(4, data.ndim + 1)))\n", + " data = data.reshape((n * data.shape[1], n * data.shape[3]) + data.shape[4:])\n", + " \n", + " plt.imshow(data)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# take an array of shape (n, height, width) or (n, height, width, channels)\n", + "# and visualize each (height, width) thing in a grid of size approx. 
sqrt(n) by sqrt(n)\n",
+ "def vis_square_color(data, padsize=1, padval=0): \n",
+ " data = np.copy(data)\n",
+ " # force the number of filters to be square\n",
+ " n = int(np.ceil(np.sqrt(data.shape[0])))\n",
+ " padding = ((0, n ** 2 - data.shape[0]), (0, padsize), (0, padsize)) + ((0, 0),) * (data.ndim - 3)\n",
+ " data = np.pad(data, padding, mode='constant', constant_values=(padval, padval))\n",
+ " \n",
+ " # tile the filters into an image\n",
+ " data = data.reshape((n, n) + data.shape[1:]).transpose((0, 2, 1, 3) + tuple(range(4, data.ndim + 1)))\n",
+ " data = data.reshape((n * data.shape[1], n * data.shape[3]) + data.shape[4:])\n",
+ " print data.shape\n",
+ " plt.imshow(data)\n",
+ " plt.colorbar()\n"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The input image, shown raw and mean-subtracted"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "# index four is the center crop\n",
+ "input_data = net.deprocess('data', net.blobs['data'].data[4])\n",
+ "print input_data.shape\n",
+ "plt.subplot(2, 1, 1)\n",
+ "plt.imshow(input_data)\n",
+ "plt.subplot(2, 1, 2)\n",
+ "mean_img_path='/deep/u/willsong/caffe/python/driving_mean_640x480_rgb.npy'\n",
+ "plt.imshow(np.clip(0.5+(input_data - np.load(mean_img_path).transpose([1,2,0])/255),0,1))"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The first layer filters, `L0` (this network's `conv1`), plus quick statistics of the `L6` weights"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "# the parameters are a list of [weights, biases]\n",
+ "filters = net.params['L0'][0].data\n",
+ "vis_square(filters.transpose(0, 2, 3, 1))\n",
+ "\n",
+ "filters = net.params['L6'][0].data\n",
+ "print np.linalg.norm(filters)\n",
+ "print np.min(filters)\n",
+ "print np.max(filters)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The first layer output, `L0` (rectified response of the first filter only), followed by the corresponding `pool1` output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "feat = net.blobs['L0'].data[4, :1]\n",
+ "print np.linalg.norm(net.blobs['L0'].data[4, :1])\n",
+ "print np.max(net.blobs['L0'].data[4, :1])\n",
+ "vis_square_color(feat, padval=0)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "feat = net.blobs['pool1'].data[4, :1]\n",
+ "print np.linalg.norm(feat)\n",
+ "print np.max(feat)\n",
+ "vis_square_color(feat, padval=0)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The second layer filters, `L1` (this network's `conv2`)\n",
+ "\n",
+ "Each second-layer filter is 5 x 5 and spans all of the first-layer channels. The next cell takes the first 96 filters and unrolls them so that every (filter, channel) pair becomes its own 5 x 5 tile, with each filter occupying one row of the grid."
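+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a small self-contained illustration of that unrolling, here is the same reshape applied to a random array instead of the real weights (the shapes below are made up for the example):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "# Toy illustration of the channel-unrolling reshape used in the next cell.\n",
+ "toy = np.random.rand(6, 3, 5, 5)   # pretend: 6 filters, 3 input channels, 5x5 kernels\n",
+ "tiles = toy.reshape(6 * 3, 5, 5)   # one 5x5 tile per (filter, channel) pair\n",
+ "print(tiles.shape)\n",
+ "vis_square(tiles, padval=0.5)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And the actual second-layer weights:"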
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "filters = net.params['L1'][0].data\n",
+ "print filters.shape\n",
+ "vis_square(filters[:96].reshape(96*96, 5, 5))\n"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The second layer output, `L1` (rectified, first channel only)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "feat = net.blobs['L1'].data[4, :1]\n",
+ "#vis_square(feat, padval=1)\n",
+ "print np.linalg.norm(feat)\n",
+ "print np.max(feat)\n",
+ "vis_square_color(feat, padval=0)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The third layer output, `L2` (rectified, all channels)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "feat = net.blobs['L2'].data[4]\n",
+ "vis_square(feat, padval=0.5)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The fourth layer output, `L3` (rectified, all channels)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "feat = net.blobs['L3'].data[4]\n",
+ "vis_square(feat, padval=0.5)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The fifth layer output, `L4` (rectified, all channels)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "feat = net.blobs['L4'].data[4]\n",
+ "vis_square(feat, padval=0.5)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The fifth layer after pooling, `pool5`, followed by the `L6d` feature map"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "feat = net.blobs['pool5'].data[4]\n",
+ "vis_square(feat, padval=1)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "feat = net.blobs['L6d'].data[4]\n",
+ "print np.linalg.norm(net.blobs['L6d'].data[4])\n",
+ "print np.max(net.blobs['L6d'].data[4])\n",
+ "print np.min(net.blobs['L6d'].data[4])\n",
+ "vis_square(feat, padval=1)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Compute the pixel-level confidence of car presence and overlay it on the input image."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "feat = net.blobs['pixel-prob'].data[4]\n",
+ "image = net.deprocess('data', net.blobs['data'].data[4])\n",
+ "mask = feat[1, :, :]\n",
+ "\n",
+ "print np.max(mask)\n",
+ "\n",
+ "new_mask = np.zeros(mask.shape)\n",
+ "#new_mask[4:,4:] = mask[:-4,:-4]\n",
+ "\n",
+ "#zoomed_mask = scipy.ndimage.zoom(new_mask, 4, order=0)\n",
+ "zoomed_mask = scipy.ndimage.zoom(mask, 4, order=0)\n",
+ "\n",
+ "masked_image = image.transpose((2, 0, 1))\n",
+ "masked_image[0, :, :] += zoomed_mask\n",
+ "masked_image = np.clip(masked_image, 0, 1)\n",
+ "\n",
+ "masked_image = masked_image.transpose((1, 2, 0))\n",
+ "plt.imshow(masked_image)\n"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Extract bounding boxes from the `bb-output-tiled` regression maps."
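+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The extraction cell adds a coordinate grid with a 4-pixel stride to the regression maps, turning per-location offsets into absolute pixel coordinates (the maps are 160 wide by 120 tall for a 640 x 480 input). Below is a minimal sketch of just the grid construction, so the broadcasting is easy to follow:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "# Sketch of the stride-4 coordinate grids that get added to the bb regression maps.\n",
+ "y_grid = np.tile(np.arange(0, 480, 4).reshape(-1, 1), (1, 160))  # shape (120, 160), row index -> y\n",
+ "x_grid = np.tile(np.arange(0, 640, 4), (120, 1))                 # shape (120, 160), column index -> x\n",
+ "print(x_grid.shape)\n",
+ "print(y_grid.shape)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now run the actual extraction and group the resulting rectangles:"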
+ ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "hard_mask = np.round(mask + 0.25)\n", + "print np.sum(hard_mask)\n", + "feat = net.blobs['bb-output-tiled'].data[4][:4,:,:]\n", + "#depth = net.blobs['bb-output-tiled'].data[4][4,:,:]\n", + "#depth *= hard_mask\n", + "bb = np.copy(feat)\n", + "print bb.shape\n", + "for c in range(4):\n", + " bb[c, :, :] *= hard_mask\n", + " \n", + "y_offset = np.array([np.arange(0, 480, 4)]).T\n", + "y_offset = np.tile(y_offset, (1, 160))\n", + "x_offset = np.arange(0, 640, 4)\n", + "x_offset = np.tile(x_offset, (120, 1))\n", + "\n", + "# Old way of offseting x and y\n", + "#x_offset = np.array([np.arange(0, 640, 32)])\n", + "#x_offset = np.tile(x_offset, (15, 1))\n", + "#x_offset = scipy.ndimage.zoom(x_offset, 8, order=0)\n", + "#y_offset = np.array([np.arange(0, 480, 32)]).T\n", + "#y_offset = np.tile(y_offset, (1, 20))\n", + "#y_offset = scipy.ndimage.zoom(y_offset, 8, order=0)\n", + "#print y_offset.shape\n", + "\n", + "bb[0, :, :] += x_offset\n", + "bb[2, :, :] += x_offset\n", + "bb[1, :, :] += y_offset\n", + "bb[3, :, :] += y_offset\n", + "\n", + "selected_rects = hard_mask > 0\n", + "num_rects = np.sum(selected_rects)\n", + "rects = np.empty((num_rects, 4))\n", + "for i in range(4):\n", + " rects[:, i] = bb[i, selected_rects] #+ 16\n", + "rects = rects[np.logical_and((rects[:, 2] - rects[:, 0]) > 0, (rects[:, 3] - rects[:, 1]) > 0), :]\n", + "rects[:, (2, 3)] -= rects[:, (0, 1)]\n", + "rects = np.clip(rects, 0, 640)\n", + "rects = [rects[i, :] for i in range(rects.shape[0])]\n", + "rects, scores = cv2.groupRectangles(rects, 2, 0.15)\n", + "print rects.size\n", + "print scores.size\n" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Output distribution of bounding box labels." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "bb = np.copy(feat[:, :, :])\n", + "for c in range(4):\n", + " bb[c, :, :] *= hard_mask\n", + "plt.subplot(4, 1, 1)\n", + "plt.plot(feat.flat)\n", + "plt.subplot(4, 1, 2)\n", + "plt.plot(bb.flat)\n", + "plt.subplot(4, 1, 3)\n", + "_ = plt.hist(bb.flat[bb.flat != 0], bins=100)\n", + "plt.subplot(4, 1, 4)\n", + "#plt.plot(depth.flat)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Function for drawing rectangles in images." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def draw_rects(image, rects):\n", + " for i in range(rects.shape[0]):\n", + " xmin, ymin, w, h = rects[i, :]\n", + " xmax = xmin + w\n", + " ymax = ymin + h\n", + " image[ymin:ymax+1, xmin:xmin+2, 1] = 1\n", + " image[ymin:ymax+1, xmax:xmax+2, 1] = 1\n", + " image[ymin:ymin+2, xmin:xmax+1, 1] = 1\n", + " image[ymax:ymax+2, xmin:xmax+1, 1] = 1 \n", + " return image\n", + " " + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Draw both confidence and bounding boxes." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "boxed_image = np.copy(masked_image)\n", + "boxed_image = draw_rects(boxed_image, rects)\n", + "plt.imshow(boxed_image)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [], + "language": "python", + "metadata": {}, + "outputs": [] + } + ], + "metadata": {} + } + ] +} \ No newline at end of file