diff --git a/examples/stochastic_depth_layers.ipynb b/examples/stochastic_depth_layers.ipynb new file mode 100644 index 0000000..25459da --- /dev/null +++ b/examples/stochastic_depth_layers.ipynb @@ -0,0 +1,704 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import theano\n", + "from theano import tensor as T\n", + "from theano.ifelse import ifelse\n", + "import lasagne\n", + "from lasagne.layers import *\n", + "from lasagne.nonlinearities import *\n", + "from lasagne.objectives import *\n", + "from lasagne.regularization import *\n", + "from lasagne.random import get_rng\n", + "from lasagne.updates import *\n", + "from lasagne.init import *\n", + "from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams\n", + "import numpy as np\n", + "from urllib import urlretrieve\n", + "import cPickle as pickle\n", + "import gzip\n", + "import imp\n", + "import os\n", + "from time import time" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "class BinomialDropLayer(Layer):\n", + " def __init__(self, incoming, survival_p=0.5, **kwargs):\n", + " super(BinomialDropLayer, self).__init__(incoming, **kwargs)\n", + " self._srng = RandomStreams(get_rng().randint(1, 2147462579))\n", + " self.p = survival_p\n", + "\n", + " def get_output_for(self, input, deterministic=False, **kwargs):\n", + " if deterministic:\n", + " # at test time, scale by the expected value of the mask\n", + " return self.p*input\n", + " else:\n", + " # per-example alternative:\n", + " #mask = self._srng.binomial(n=1, p=self.p, size=(input.shape[0],),\n", + " # dtype=input.dtype)\n", + " # draw one Bernoulli(p) value and broadcast it over the whole minibatch\n", + " mask = T.zeros((input.shape[0],)) + self._srng.binomial((1,), p=self.p, dtype=input.dtype)[0]\n", + " mask = mask.dimshuffle(0,'x','x','x')\n", + " return mask*input" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class IfElseDropLayer(Layer):\n", + " def __init__(self, incoming, survival_p=0.5, **kwargs):\n", + " super(IfElseDropLayer, self).__init__(incoming, **kwargs)\n", + " self._srng = RandomStreams(get_rng().randint(1, 2147462579))\n", + " self.p = survival_p\n", + "\n", + " def get_output_for(self, input, deterministic=False, **kwargs):\n", + " if deterministic:\n", + " return self.p*input\n", + " else:\n", + " # keep the input with probability p, otherwise output zeros\n", + " return ifelse(\n", + " T.lt(self._srng.uniform( (1,), 0, 1)[0], self.p),\n", + " input,\n", + " T.zeros(input.shape)\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There is a difference between this residual block method and the one defined in https://github.com/Lasagne/Lasagne/issues/531. When the number of filters differs from the number of channels in the layer's output (or the stride is not 1), instead of using a convolution to make the shortcut compatible, we use an average pooling with a pool size of 1 and the given stride, followed (if necessary) by extra zero-padded feature maps. This is how the authors define it in https://github.com/yueatsprograms/Stochastic_Depth/blob/master/ResidualDrop.lua."
+ ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "\"\\n\\nprint('Building model...')\\nmodel = nn.Sequential()\\n------> 3, 32,32\\nmodel:add(cudnn.SpatialConvolution(3, 16, 3,3, 1,1, 1,1)\\n :init('weight', nninit.kaiming, {gain = 'relu'})\\n :init('bias', nninit.constant, 0))\\nmodel:add(cudnn.SpatialBatchNormalization(16))\\nmodel:add(cudnn.ReLU(true))\\n------> 16, 32,32 First Group\\nfor i=1,opt.N do addResidualDrop(model, nil, 16) end\\n------> 32, 16,16 Second Group\\naddResidualDrop(model, nil, 16, 32, 2)\\nfor i=1,opt.N-1 do addResidualDrop(model, nil, 32) end\\n------> 64, 8,8 Third Group\\naddResidualDrop(model, nil, 32, 64, 2)\\nfor i=1,opt.N-1 do addResidualDrop(model, nil, 64) end\\n------> 10, 8,8 Pooling, Linear, Softmax\\nmodel:add(nn.SpatialAveragePooling(8,8)):add(nn.Reshape(64))\\nif opt.dataset == 'cifar10' or opt.dataset == 'svhn' then\\n model:add(nn.Linear(64, 10))\\nelseif opt.dataset == 'cifar100' then\\n model:add(nn.Linear(64, 100))\\nelse\\n print('Invalid argument for dataset!')\\nend\\n\\n\\n\"" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"\n", + "\n", + "print('Building model...')\n", + "model = nn.Sequential()\n", + "------> 3, 32,32\n", + "model:add(cudnn.SpatialConvolution(3, 16, 3,3, 1,1, 1,1)\n", + " :init('weight', nninit.kaiming, {gain = 'relu'})\n", + " :init('bias', nninit.constant, 0))\n", + "model:add(cudnn.SpatialBatchNormalization(16))\n", + "model:add(cudnn.ReLU(true))\n", + "------> 16, 32,32 First Group\n", + "for i=1,opt.N do addResidualDrop(model, nil, 16) end\n", + "------> 32, 16,16 Second Group\n", + "addResidualDrop(model, nil, 16, 32, 2)\n", + "for i=1,opt.N-1 do addResidualDrop(model, nil, 32) end\n", + "------> 64, 8,8 Third Group\n", + "addResidualDrop(model, nil, 32, 64, 2)\n", + "for i=1,opt.N-1 do addResidualDrop(model, nil, 64) end\n", + "------> 10, 8,8 Pooling, Linear, Softmax\n", + "model:add(nn.SpatialAveragePooling(8,8)):add(nn.Reshape(64))\n", + "if opt.dataset == 'cifar10' or opt.dataset == 'svhn' then\n", + " model:add(nn.Linear(64, 10))\n", + "elseif opt.dataset == 'cifar100' then\n", + " model:add(nn.Linear(64, 100))\n", + "else\n", + " print('Invalid argument for dataset!')\n", + "end\n", + "\n", + "\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def residual_block(layer, num_filters, filter_size=3, stride=1, num_layers=2, survival_p=0.5):\n", + " #print \"input =\", layer.output_shape\n", + " conv = layer\n", + " if (num_filters != layer.output_shape[1]) or (stride != 1):\n", + " layer = Pool2DLayer(layer, pool_size=1, stride=stride, mode=\"average_inc_pad\")\n", + " diff = num_filters-layer.output_shape[1]\n", + " if diff % 2 == 0: \n", + " width_tp = ((diff/2, diff/2),)\n", + " else:\n", + " width_tp = (((diff/2)+1, diff/2),)\n", + " layer = pad(\n", + " layer, \n", + " batch_ndim=1, \n", + " width=width_tp\n", + " )\n", + " #print \"layer =\", layer.output_shape\n", + " for _ in range(num_layers):\n", + " conv = Conv2DLayer(conv, num_filters, filter_size, stride=stride, pad='same')\n", + " #print \"conv =\", conv.output_shape\n", + " stride = 1\n", + " nonlinearity = conv.nonlinearity\n", + " conv.nonlinearity = lasagne.nonlinearities.identity\n", + " conv = BinomialDropLayer(conv, survival_p=survival_p)\n", + " return 
NonlinearityLayer(ElemwiseSumLayer([conv, layer]), nonlinearity)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# architecture from:\n", + "# https://github.com/yueatsprograms/Stochastic_Depth/blob/master/main.lua\n", + "survival_p = 0.5\n", + "layer = InputLayer( (None, 3, 32, 32) )\n", + "layer = Conv2DLayer(layer, num_filters=16, filter_size=3, stride=1, pad='same')\n", + "#layer = Pool2DLayer(layer, 2)\n", + "for _ in range(18):\n", + " layer = residual_block(layer, 16, survival_p=survival_p)\n", + "layer = residual_block(layer, 32, stride=2, survival_p=survival_p)\n", + "for _ in range(18):\n", + " layer = residual_block(layer, 32, survival_p=survival_p)\n", + "layer = residual_block(layer, 64, stride=2, survival_p=survival_p)\n", + "for _ in range(18):\n", + " layer = residual_block(layer, 64, survival_p=survival_p)\n", + "layer = Pool2DLayer(layer, pool_size=8, stride=1, mode=\"average_inc_pad\")\n", + "layer = DenseLayer(layer, num_units=10, nonlinearity=softmax)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " (None, 3, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x108d01790> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x14a3a4d10> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x14a510050> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x147e24650> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x12049b750> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x1204bc150> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x1204baf50> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x1204ba8d0> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x1204ba5d0> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x1204bc650> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x1204bce50> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x12049b1d0> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 
32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x1204882d0> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x120488310> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x120175050> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x120175990> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x1201754d0> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x120160f90> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x120160ed0> (None, 32, 16, 16)\n", + " (None, 16, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x120160690> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1201458d0> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x120145f90> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1201452d0> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x12014e350> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x12014ed10> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x12014e510> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204dab90> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204dad50> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204da1d0> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204db5d0> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204dbd90> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " 
(None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204db350> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204ffd10> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204ffa50> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204ff1d0> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204f5e10> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204f5a10> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x1204f5190> (None, 64, 8, 8)\n", + " (None, 32, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x124db5e50> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x124db5a10> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x124db53d0> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x124db1910> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x124db1650> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x124db1490> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x124da0350> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x124da0710> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x1145df510> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x149a291d0> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x114702690> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x125a9dc90> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x12758b110> (None, 64, 8, 
8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x12d30d310> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x108d0c850> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x108db92d0> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x14978b2d0> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x124d8cad0> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 1, 1)\n", + " (None, 10)\n" + ] + } + ], + "source": [ + "for layer in get_all_layers(layer):\n", + " print layer, layer.output_shape" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "cifar10_loader = imp.load_source(\"cifar10_loader\", \"../papers/deep_residual_learning/Deep_Residual_Learning_CIFAR-10.py\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#curr_dir = os.getcwd()\n", + "#os.chdir(\"../papers/deep_residual_learning/\")\n", + "data = cifar10_loader.load_data()\n", + "X_train_and_valid, y_train_and_valid, X_test, y_test = \\\n", + " data[\"X_train\"][0:50000], data[\"Y_train\"][0:50000], data[\"X_test\"], data[\"Y_test\"]\n", + "#os.chdir(curr_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n", + " if __name__ == '__main__':\n", + "/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n", + " from ipykernel import kernelapp as app\n", + "/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n", + " app.launch_new_instance()\n", + "/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n" + ] + } + ], + "source": [ + "X_train = X_train_and_valid[ 0 : 0.9*X_train_and_valid.shape[0] ]\n", + "y_train = y_train_and_valid[ 0 : 0.9*y_train_and_valid.shape[0] ]\n", + "X_valid = X_train_and_valid[ 0.9*X_train_and_valid.shape[0] :: ]\n", + "y_valid = y_train_and_valid[ 0.9*y_train_and_valid.shape[0] :: ]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(90000, 3, 32, 32)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "collapsed": false + }, + 
"outputs": [], + "source": [ + "X = T.tensor4('X')\n", + "y = T.ivector('y')\n", + "\n", + "net_out = get_output(l_out, X)\n", + "net_out_det = get_output(l_out, X, deterministic=True)\n", + "loss = categorical_crossentropy(net_out, y).mean()\n", + "params = get_all_params(l_out, trainable=True)\n", + "grads = T.grad(loss, params)\n", + "updates = nesterov_momentum(grads, params, learning_rate=0.01, momentum=0.9)\n", + "train_fn = theano.function(inputs=[X, y], outputs=loss, updates=updates)\n", + "eval_fn = theano.function(inputs=[X, y], outputs=loss)\n", + "preds_fn = theano.function(inputs=[X], outputs=net_out_det)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#X_train = X_train[0:10]\n", + "#y_train = y_train[0:10]\n", + "#X_valid = X_valid[0:10]\n", + "#y_valid = y_valid[0:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch,avg_train_loss,valid_loss,valid_acc,time\n", + "1,nan,inf,0.200000,0.808203\n", + "2,nan,inf,0.200000,0.669914\n", + "3,nan,inf,0.200000,0.633933\n", + "4,nan,inf,0.200000,0.646192\n", + "5,nan,inf,0.200000,0.618688\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mtrain_losses\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0mtrain_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mbs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mbs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mbs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mbs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mvalid_loss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0meval_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_valid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_valid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0mvalid_preds\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpreds_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_valid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 12\u001b[0m \u001b[0mvalid_acc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalid_preds\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0my_valid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;36m1.0\u001b[0m \u001b[0;34m/\u001b[0m 
\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_valid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0;34m\"%i,%f,%f,%f,%f\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mepoch\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_losses\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalid_loss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalid_acc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mt0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m//anaconda/lib/python2.7/site-packages/Theano-0.8.0.dev0-py2.7.egg/theano/compile/function_module.pyc\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 857\u001b[0m \u001b[0mt0_fn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 858\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 859\u001b[0;31m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 860\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 861\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'position_of_error'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m//anaconda/lib/python2.7/site-packages/Theano-0.8.0.dev0-py2.7.egg/theano/gof/op.pyc\u001b[0m in \u001b[0;36mrval\u001b[0;34m(p, i, o, n)\u001b[0m\n\u001b[1;32m 905\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mparams\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mgraph\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mNoParams\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 906\u001b[0m \u001b[0;31m# default arguments are stored in the closure of `rval`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 907\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mrval\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnode_input_storage\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mo\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnode_output_storage\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 908\u001b[0m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mo\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 909\u001b[0m \u001b[0;32mfor\u001b[0m 
\u001b[0mo\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mnode\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "bs = 128\n", + "n_batches = X_train.shape[0] // bs\n", + "num_epochs = 10\n", + "print \"epoch,avg_train_loss,valid_loss,valid_acc,time\"\n", + "for epoch in range(0, num_epochs):\n", + " t0 = time()\n", + " train_losses = []\n", + " for b in range(0, n_batches):\n", + " train_losses.append( train_fn(X_train[b*bs:(b+1)*bs], y_train[b*bs:(b+1)*bs]) )\n", + " valid_loss = eval_fn(X_valid, y_valid)\n", + " valid_preds = np.argmax(preds_fn(X_valid),axis=1)\n", + " valid_acc = np.sum(valid_preds == y_valid)*1.0 / len(y_valid)\n", + " print \"%i,%f,%f,%f,%f\" % (epoch+1, np.mean(train_losses), valid_loss, valid_acc, time()-t0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/stochastic_depth_layers.py b/examples/stochastic_depth_layers.py new file mode 100644 index 0000000..d75e9d0 --- /dev/null +++ b/examples/stochastic_depth_layers.py @@ -0,0 +1,170 @@ +import theano +from theano import tensor as T +from theano.ifelse import ifelse +import lasagne +from lasagne.layers import * +from lasagne.nonlinearities import * +from lasagne.objectives import * +from lasagne.regularization import * +from lasagne.random import get_rng +from lasagne.updates import * +from lasagne.init import * +from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams +import numpy as np +import imp +import os +from time import time + +""" + Binomial dropout layer + + Samples a binomial(n=1, p) random variable and multiplies the input + tensor by this value. On its own, this layer is of little use: it + either multiplies everything by one (i.e. does nothing), + or it sets every value in the tensor to zero (losing all information). + It is intended to be used inside a residual block, where it randomly + drops the entire residual path so that only the shortcut remains.
+ + Parameters + ---------- + + incoming : a :class:`Layer` instance + p : float + The survival probability for the input tensor + +""" +class BinomialDropLayer(Layer): + def __init__(self, incoming, p=0.5, **kwargs): + super(BinomialDropLayer, self).__init__(incoming, **kwargs) + self._srng = RandomStreams(get_rng().randint(1, 2147462579)) + self.p = p + + def get_output_for(self, input, deterministic=False, **kwargs): + if deterministic: + # at test time, scale by the expected value of the mask + return self.p*input + else: + # per-example alternative: + #mask = self._srng.binomial(n=1, p=self.p, size=(input.shape[0],), + # dtype=input.dtype) + # draw one Bernoulli(p) value and broadcast it across the whole minibatch + mask = T.zeros((input.shape[0],)) + self._srng.binomial((1,), p=self.p, dtype=input.dtype)[0] + mask = mask.dimshuffle(0,'x','x','x') + return mask*input + +class IfElseDropLayer(Layer): + def __init__(self, incoming, p=0.5, **kwargs): + super(IfElseDropLayer, self).__init__(incoming, **kwargs) + self._srng = RandomStreams(get_rng().randint(1, 2147462579)) + self.p = p + + def get_output_for(self, input, deterministic=False, **kwargs): + if deterministic: + return self.p*input + else: + # keep the input with probability p, otherwise propagate zeros + return ifelse( + T.lt(self._srng.uniform( (1,), 0, 1)[0], self.p), + input, + T.zeros(input.shape) + ) + + +""" +There is a difference between this residual block method and the one that is defined in: + +https://github.com/Lasagne/Lasagne/issues/531 + +When the number of filters differs from the number of channels in the layer's output +(or the stride is not 1), instead of using a convolution to make the shortcut compatible, +we use an average pooling with a pool size of 1 and the given stride, followed +(if necessary) by extra zero-padded feature maps. This is how the authors define it in: + +https://github.com/yueatsprograms/Stochastic_Depth/blob/master/ResidualDrop.lua +""" + +def residual_block(layer, n_out_channels, stride=1, survival_p=0.5): + conv = layer + if stride > 1: + layer = Pool2DLayer(layer, pool_size=1, stride=stride, mode="average_inc_pad") + if (n_out_channels != layer.output_shape[1]): + diff = n_out_channels-layer.output_shape[1] + if diff % 2 == 0: + width_tp = ((diff/2, diff/2),) + else: + width_tp = (((diff/2)+1, diff/2),) + layer = pad(layer, batch_ndim=1, width=width_tp) + conv = Conv2DLayer(conv, num_filters=n_out_channels, + filter_size=(3,3), stride=(stride,stride), pad=(1,1), nonlinearity=linear) + conv = BatchNormLayer(conv) + conv = NonlinearityLayer(conv, nonlinearity=rectify) + conv = Conv2DLayer(conv, num_filters=n_out_channels, + filter_size=(3,3), stride=(1,1), pad=(1,1), nonlinearity=linear) + conv = BatchNormLayer(conv) + conv = BinomialDropLayer(conv, p=survival_p) + return NonlinearityLayer(ElemwiseSumLayer([conv, layer]), nonlinearity=rectify) + +def get_net(): + # Architecture from: + # https://github.com/yueatsprograms/Stochastic_Depth/blob/master/main.lua + N = 18 + # constant survival probability for every block; a linearly decaying + # schedule is sketched after this patch + survival_p = 0.5 + layer = InputLayer( (None, 3, 32, 32) ) + layer = Conv2DLayer(layer, num_filters=16, filter_size=3, stride=1, pad='same') + #layer = Pool2DLayer(layer, 2) + for _ in range(N): + layer = residual_block(layer, 16, survival_p=survival_p) + layer = residual_block(layer, 32, stride=2, survival_p=survival_p) + for _ in range(N): + layer = residual_block(layer, 32, survival_p=survival_p) + layer = residual_block(layer, 64, stride=2, survival_p=survival_p) + for _ in range(N): + layer = residual_block(layer, 64, survival_p=survival_p) + layer = Pool2DLayer(layer, pool_size=8, stride=1, mode="average_inc_pad") + layer = DenseLayer(layer, 
num_units=10, nonlinearity=softmax) + for layer in get_all_layers(layer): + print layer, layer.output_shape + print "number of params:", count_params(layer) + return layer + +cifar10_loader = imp.load_source( + "cifar10_loader", "../papers/deep_residual_learning/Deep_Residual_Learning_CIFAR-10.py") +curr_dir = os.getcwd() +os.chdir("../papers/deep_residual_learning/") +data = cifar10_loader.load_data() +X_train_and_valid, y_train_and_valid, X_test, y_test = \ + data["X_train"][0:50000], data["Y_train"][0:50000], data["X_test"], data["Y_test"] +os.chdir(curr_dir) + +X_train = X_train_and_valid[ 0 : int(0.9*X_train_and_valid.shape[0]) ] +y_train = y_train_and_valid[ 0 : int(0.9*y_train_and_valid.shape[0]) ] +X_valid = X_train_and_valid[ int(0.9*X_train_and_valid.shape[0]) : ] +y_valid = y_train_and_valid[ int(0.9*y_train_and_valid.shape[0]) : ] + +X = T.tensor4('X') +y = T.ivector('y') + +layer = get_net() +net_out = get_output(layer, X) +net_out_det = get_output(layer, X, deterministic=True) +loss = categorical_crossentropy(net_out, y).mean() +# monitor the validation loss with the deterministic (expected-value) network +loss_det = categorical_crossentropy(net_out_det, y).mean() +params = get_all_params(layer, trainable=True) +grads = T.grad(loss, params) +updates = nesterov_momentum(grads, params, learning_rate=0.01, momentum=0.9) +train_fn = theano.function(inputs=[X, y], outputs=loss, updates=updates) +eval_fn = theano.function(inputs=[X, y], outputs=loss_det) +preds_fn = theano.function(inputs=[X], outputs=net_out_det) + +bs = 128 +n_batches = X_train.shape[0] // bs +num_epochs = 10 +print "epoch,avg_train_loss,valid_loss,valid_acc,time" +for epoch in range(0, num_epochs): + # shuffle the training examples at the start of each epoch + idxs = np.arange(X_train.shape[0]) + np.random.shuffle(idxs) + X_train = X_train[idxs] + y_train = y_train[idxs] + train_losses = [] + t0 = time() + for b in range(0, n_batches): + train_losses.append( train_fn(X_train[b*bs:(b+1)*bs], y_train[b*bs:(b+1)*bs]) ) + valid_loss = eval_fn(X_valid, y_valid) + valid_preds = np.argmax(preds_fn(X_valid),axis=1) + valid_acc = np.sum(valid_preds == y_valid)*1.0 / len(y_valid) + print "%i,%f,%f,%f,%f" % (epoch+1, np.mean(train_losses), valid_loss, valid_acc, time()-t0)
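
One refinement left out of the patch above: both files use a constant survival probability of 0.5 for every residual block, whereas the stochastic depth paper (Huang et al., "Deep Networks with Stochastic Depth") recommends linearly decaying the survival probability with depth, p_l = 1 - (l/L)*(1 - p_L), with p_L = 0.5 for the deepest block. Below is a minimal sketch of how such a schedule could be threaded through residual_block(); linear_survival_probs and the modified get_net() loop are hypothetical names for illustration, not part of the patch.

    def linear_survival_probs(n_blocks, p_last=0.5):
        # block l (1-indexed) of L survives with probability 1 - (l/L)*(1 - p_last),
        # so early blocks are almost always kept and the deepest block
        # survives with probability p_last
        return [1.0 - (float(l) / n_blocks) * (1.0 - p_last)
                for l in range(1, n_blocks + 1)]

    # get_net() above builds 3*18 + 2 = 56 residual blocks, so e.g.:
    #   probs = linear_survival_probs(56)   # probs[0] ~ 0.991, probs[-1] = 0.5
    # and inside get_net(), each call becomes:
    #   layer = residual_block(layer, 16, survival_p=probs.pop(0))
    # in place of the constant survival_p=0.5 passed to every block.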