<!doctype html>
<html>
<head>
<meta charset="utf-8"/>
<title>Learning where to look: a foveated visuomotor control model - CNS*2019</title>
<meta name="description" content="Learning where to look: <BR>A foveated visuomotor control model">
<meta name="author" content="Emmanuel Daucé, Pierre Albigès & Laurent Perrinet">
<meta name="apple-mobile-web-app-capable" content="yes" >
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
<!-- General and theme style sheets -->
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/reveal.js/3.7.0/css/reveal.css">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/reveal.js/3.7.0/css/theme/simple.css" id="theme">
<!-- Theme used for syntax highlighting of code -->
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/reveal.js/3.7.0/lib/css/zenburn.css">
<!-- Printing and PDF exports -->
<script>
var link = document.createElement( 'link' );
link.rel = 'stylesheet';
link.type = 'text/css';
link.href = window.location.search.match( /print-pdf/gi ) ? 'https://cdnjs.cloudflare.com/ajax/libs/reveal.js/3.7.0/css/print/pdf.css' : 'https://cdnjs.cloudflare.com/ajax/libs/reveal.js/3.7.0/css/print/paper.css';
document.getElementsByTagName( 'head' )[0].appendChild( link );
</script>
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css">
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/reveal.js/3.7.0/lib/js/html5shiv.js"></script>
<![endif]-->
<!-- Get Font-awesome from cdn -->
<!-- <link rel="stylesheet" href="http://netdna.bootstrapcdn.com/font-awesome/4.1.0/css/font-awesome.css"> -->
</head>
<body>
<div class="reveal">
<div class="slides">
<section><section>
<h2 class="title">Learning where to look: <BR>A foveated visuomotor control model</h2>
<h3><a href="https://laurentperrinet.github.io/talk/2019-07-15-cns/">Emmanuel Daucé, Pierre Albigès & Laurent Perrinet</a></h3>
<img class="plain" data-src="figures/ins-logo.png" height="245.76px" /><img class="plain" data-src="http://laurentperrinet.github.io/slides.py/figures/troislogos.png" height="327.68px" />
<h4><a href="https://www.cnsorg.org/cns-2019">CNS*2019</a>, 15/7/2019 </h4>
<aside class="notes">
<ul>
<li>(AUTHOR) Hello, I am Emmanuel Daucé from the Institute of Neurosciences of Systems in Marseille, a joint unit of the CNRS and the AMU. This is joint work with Pierre Albigès and Laurent Perrinet from the Institute of Neurosciences of la Timone, also in Marseille.</li>
</ul>
</aside>
</section>
<section><h3>Outline</h3>
<ol>
<h3>
<li>
<p class="fragment highlight-red">
Motivation
</p>
</li>
</h3>
<h3>
<li>
Methods
</li>
</h3>
<h3>
<li>
Results
</li>
</h3>
<h3>
<li>
Conclusion
</li>
</h3>
</ol>
<aside class="notes">
</aside>
</section>
<section><h3>Computer vision</h3>
<div align="center">
<table border=0px VALIGN="center" bgcolor=white height=921.6>
<tr padding=0px style="vertical-align:middle" bgcolor=white>
<td height=921 width="1280" padding-top=0px padding-bottom=0px style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/CNS-2-general-A.svg" height="921px" />
</p>
</td>
</tr>
</table>
</div>
<aside class="notes">
<ul>
<li>(OBJECTIVE)</li>
</ul>
<p>The past 5-10 years have seen a huge development of machine-learning/deep-learning-based image processing. Indeed, artificial vision has been revolutionized by
the incredible capability of convolution-based deep networks to capture the semantic content of images/photographs. Their success relies on a reduction of parameter complexity
through weight sharing in convolutional neural networks applied over the full image. In order to increase the recognition capability, there has been an inflation in the number of layers needed
to process the pixel information. Finally, the processing of large images comes at a cost that scales quadratically with the image resolution. All regions, even the "boring" ones, are
systematically scanned and processed in parallel at a high computational cost.</p>
<p>Typical ML processing:
- bounding boxes around objects of interest
- (at best) linear scaling in #pixels</p>
</aside>
</section>
<section><h3>Human vision</h3>
<div align="center">
<table border=0px VALIGN="center" bgcolor=white height=921.6>
<tr padding=0px style="vertical-align:middle" bgcolor=white>
<td height=921 width="1280" padding-top=0px padding-bottom=0px style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/CNS-2-general-B.svg" height="921px" />
</p>
</td>
</tr>
</table>
</div>
<aside class="notes">
<ul>
<li>(OBJECTIVE)</li>
</ul>
<p>In contrast, when human vision is considered, things work quite differently.
Indeed, human (and animal) vision relies on a non-isotropic sensor (the retina) that has a very high resolution at the center of fixation and a very poor resolution at the periphery.</p>
<p>Crucially, human vision is <strong>dynamic</strong>. The scanning of a full visual scene is not done in parallel but sequentially, and only scene-relevant regions of interest are scanned through saccades. This implies a <strong>decision process</strong> at each step that decides <strong>where to look next</strong>.</p>
<p>We propose here that such a strategy ("non-isotropic" convolution) allows for an economical processing of images, by processing the position of objects independently from their category.</p>
</aside>
</section>
<section><h3>Statistical Viewpoint</h3>
<div align="center">
<table border=0px VALIGN="center" bgcolor=white height=921.6>
<tr padding=0px style="vertical-align:middle" bgcolor=white>
<td height=921 width="1280" padding-top=0px padding-bottom=0px style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/CNS - Modelling - I.svg" height="921px" />
</p>
</td>
</tr>
</table>
</div>
<aside class="notes">
<p>This kind of reasoning can be captured by a statistical framework called a POMDP (partially observable Markov decision process), where the cause of a visual field is a couple made of
a viewpoint and scene elements. Changing the viewpoint leads to a different scene rendering. Knowing the current view, you need to choose the next viewpoint that will help you
disambiguate the scene.</p>
<p>In a classic inference framework, a (generative) model tells how the visual field typically looks given the scene elements and a certain viewpoint. Using Bayes' rule, you may then infer the scene elements from the
current viewpoint (model inversion).</p>
<p>The more viewpoints you have, the more certain you are about the content of the scene.</p>
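<p>A minimal LaTeX sketch of this inversion, writing x for the visual field, y for the scene elements and u for the viewpoint (symbols chosen here for illustration only):</p>
<pre><code>% posterior over the scene elements, given the current view and viewpoint
p(y \mid x, u) \propto p(x \mid y, u)\, p(y)

% evidence accumulated over a sequence of viewpoints, assuming the views are
% conditionally independent given the scene (a standard ideal-observer assumption)
p(y \mid x_{1:T}, u_{1:T}) \propto p(y) \prod_{t=1}^{T} p(x_t \mid y, u_t)
</code></pre>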
</aside>
</section>
<section><h3>Attention vs. Scene Understanding</h3>
<div align="center">
<table border=0px VALIGN="center" bgcolor=white height=921.6>
<tr padding=0px style="vertical-align:middle" bgcolor=white>
<td height=921 width="1280" padding-top=0px padding-bottom=0px style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/CNS - Modelling - II.svg" height="921px" />
</p>
</td>
</tr>
</table>
</div>
<aside class="notes">
<p>Given a generative model of the environment, one can define a quantity called the Bayesian surprise, which tells how different the visual data is from your initial guess.
Itti and Koch predict that the eye is attracted by the Bayesian surprise, i.e. by the regions of the image that depart the most from the baseline image statistics.
This makes it possible to define salient regions in an image and to draw saliency maps that predict where the eye is attracted the most. This may explain up to 50% of the human scan path, but it is purely phenomenological.</p>
<ul>
<li>Laurent Itti and Christof Koch. <strong>A saliency-based search mechanism
for overt and covert shifts of visual attention</strong>. In: Vision
Research 40.10-12 (2000), pp. 1489--1506.</li>
<li>M. Kümmerer, L. Theis, and M. Bethge <strong>Deep Gaze I: Boosting
Saliency Prediction with Feature Maps Trained on ImageNet</strong> ICLR
Workshop, 2015</li>
</ul>
<p>Top-down (sequential decision):</p>
<p>A more detailed model, originally proposed by Najemnik and Geisler, describes natural vision in a visual search task as a sequential process. Given a generative model of the visual field (an ideal observer that knows everything about how the visual data is generated), and given statistics over the hypothesis space (Where is Waldo?), the model decides <strong>where to look next</strong>: it chooses the next viewpoint that will provide the best <strong>information gain</strong>. The selection is reiterated several times until enough evidence is gathered.</p>
<p>In general, the active inference setup means using a generative model to quantify the benefit of a certain action (changing viewpoint) in reducing the <strong>posterior entropy</strong> given a history of past actions (viewpoints), which corresponds to a better understanding of the visual scene.</p>
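<p>A minimal LaTeX sketch of this criterion (notation chosen here for illustration: y is the scene hypothesis, u a candidate viewpoint, x the views gathered so far and x' the view expected after the saccade):</p>
<pre><code>% expected information gain of viewpoint u = expected reduction of the posterior entropy
IG(u) = H\!\left[ p(y \mid x) \right]
      - \mathbb{E}_{x' \sim p(x' \mid x, u)} \left[ H\!\left[ p(y \mid x, x', u) \right] \right]

% the next viewpoint is the one maximizing the expected gain
u^{*} = \arg\max_{u} IG(u)
</code></pre>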
<ul>
<li>J. Najemnik and Wilson S. Geisler. <strong>Optimal eye movement
strategies in visual search</strong>. In: Nature 434 (2005)</li>
<li>Nicholas J. Butko and Javier R. Movellan. <strong>Infomax control of eye
movements</strong>. In: IEEE Transactions on Autonomous Mental
Development 2.2 (2010)</li>
<li>Fu, J., Zheng, H., & Mei, T. (2017). Look closer to see better: Recurrent attention convolutional neural network for fine-grained image recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 4438-4446).</li>
</ul>
</aside>
</section>
</section>
<section><section><h3>Outline</h3>
<ol>
<h3>
<li>
Motivation
</li>
</h3>
<h3>
<li>
<p class="fragment highlight-red">
Methods
</p>
</li>
</h3>
<h3>
<li>
Results
</li>
</h3>
<h3>
<li>
Conclusion
</li>
</h3>
</ol>
<aside class="notes">
<p>Indeed, we will use the separation of the two problems (where and what), as they are confronted with nuisances of different kinds.</p>
</aside>
</section>
<section><h3>Principles for central and peripheral vision</h3>
<div align="center">
<table border=0px VALIGN="center" bgcolor=white height=921.6>
<tr padding=0px style="vertical-align:middle" bgcolor=white>
<td height=921 width="1280" padding-top=0px padding-bottom=0px style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/CNS-what-where-principles.svg" height="921px" />
</p>
</td>
</tr>
</table>
</div>
<aside class="notes">
<p>So what we propose here is to go a little further in a biomimetic implementation of an artificial vision system.
(Why: biomimetic systems are the result of a continual optimization throughout ages of evolution; they optimize signal processing under strong material and energy constraints, for specific survival purposes.)
Objective: build an effective artificial foveal vision.
We concentrate here on the foveal vision case.</p>
<p>What is specific about foveal vision?
Foveal vision is a trick selected by evolution: a compromise between resource saving and accuracy (budgeted vision).
The fovea, which concentrates most of the photoreceptors, represents less than 2% of the total visual field.
In a foveal vision setting, the current view may allow you to tell that there is an object of interest in your peripheral vision (for instance a face) that you cannot identify, and you need to make a saccade to
identify the person.</p>
<p>So, in order to analyze a complex visual scene, two types of processing need to be done. On the one side, you need to process in detail what is at the center of fixation, that is, the region of interest currently processed. On the other side, you also need to analyze the surrounding part, even if the resolution is low, in order to choose the next position of fixation. This basically means making a choice of "what's interesting next". You do not necessarily need to know what it is, but you need to know that it is interesting enough, and of course you need to know what action to take to move the center of fixation to the right position.</p>
<p>If we now consider the information gain metric, it shows an interesting correspondence with the central/peripheral processing trade-off. In a sequential setup, the rightmost term can be interpreted as the current state of understanding before the saccade is actuated, that is, the information present at the center of the retina -- and the left term can be seen as the future state of understanding after the saccade is executed, which relies on interpreting the peripheral information.</p>
</aside>
</section>
<section data-background="figures/film_FIX.png" data-background-size="1280px">
</section>
<section data-background="figures/film_display0.png" data-background-size="1280px">
</section>
<section data-background="figures/film_display0_SAC.png" data-background-size="1280px">
</section>
<section data-background="figures/film_ANS.png" data-background-size="1280px">
</section>
<section data-background="figures/film_FIX.png" data-background-size="1280px">
</section>
<section data-background="figures/film_display4.png" data-background-size="1280px">
</section>
<section data-background="figures/film_display4_SAC.png" data-background-size="1280px">
</section>
<section data-background="figures/film_ANS.png" data-background-size="1280px">
</section>
<section data-background="figures/film_FIX.png" data-background-size="1280px">
</section>
<section data-background="figures/film_display8.png" data-background-size="1280px">
</section>
<section data-background="figures/film_display8_SAC.png" data-background-size="1280px">
</section>
<section data-background="figures/film_ANS.png" data-background-size="1280px">
</section>
<section data-background="figures/film_FIX.png" data-background-size="1280px">
</section>
<section data-background="figures/film_display9.png" data-background-size="1280px">
</section>
<section data-background="figures/film_display9_SAC.png" data-background-size="1280px">
</section>
<section data-background="figures/film_ANS.png" data-background-size="1280px">
</section>
<section><h3>Methods - ''Experimental'' setup</h3>
<div align="center">
<table border=0px VALIGN="center" bgcolor=white height=737.2800000000001>
<tr padding=0px style="vertical-align:middle" bgcolor=white>
<td height=737 width="1280" padding-top=0px padding-bottom=0px style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/fig_intro.jpg" height="737px" />
</p>
</td>
</tr>
</table>
</div>
<aside class="notes">
<p>We reproduce in simulation the conditions of a psychophysics experiment.</p>
<p>The problem is to identify a digit that is placed at random over a noisy background, that is, finding the target identity. The agent fixates the center of the screen and should give an answer about which digit was present in the image.
This corresponds to a classic environment control in psychophysics experiments.
Different parameters are controlled, such as the target eccentricity, the background noise and the contrast, in order to vary the difficulty of the task (a minimal sketch of such a display is given at the end of these notes).</p>
<p>(B) The agent can make a saccade, in which case the center of fixation moves toward the expected location of the target.</p>
<p>(C) The agent's subjective perception is shown on the lower right part. The larger the target eccentricity, the more difficult the identification. There is a range of eccentricities for which it is impossible to identify the target from a single glance, so that a saccade is necessary to issue a proper response.</p>
<p>DONE (Laurent): generate the frames for a "film".</p>
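<p>A minimal Python sketch of the display generation described above. The image size, digit size and noise model are placeholders chosen for illustration, not the exact protocol:</p>
<pre><code>import numpy as np

def make_display(digit, eccentricity, angle, contrast, noise_std=0.1, size=128):
    """Place a 28x28 digit patch at (eccentricity, angle) over a noisy background.

    `digit` is assumed to be a 28x28 array in [0, 1]; the eccentricity must keep
    the patch inside the image.
    """
    image = noise_std * np.random.randn(size, size)              # noisy background
    cx = int(size // 2 + eccentricity * np.cos(angle))           # target center (pixels)
    cy = int(size // 2 + eccentricity * np.sin(angle))
    image[cy - 14:cy + 14, cx - 14:cx + 14] += contrast * digit  # contrast-scaled target
    return np.clip(image, -1.0, 1.0)
</code></pre>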
</aside>
</section>
<section><h3>Methods: What/Where separation</h3>
<div align="center">
<table border=0px VALIGN="center" bgcolor=white height=921.6>
<tr padding=0px style="vertical-align:middle" bgcolor=white>
<td height=921 width="1280" padding-top=0px padding-bottom=0px style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/CNS-what-where.svg" height="921px" />
</p>
</td>
</tr>
</table>
</div>
<aside class="notes">
<p>Consider our simplified visual scene containing a non-centered digit over a noisy background. We process the central part of the visual field and the periphery separately, consistently with information-gain-based action selection.</p>
<p>We consider in our setup a slight simplification, which is sampling the prior and the posterior on the true label.
The information gain then becomes the difference between the future accuracy and the central accuracy.
The accuracy here takes the role of a proxy for the posterior entropy.
Importantly, the future accuracy is a score that does not predict the future label. It just tells how correct the response will be after performing saccade a.</p>
<p>The separation into current accuracy and future accuracy is reminiscent of the What/Where visual processing separation observed in monkeys and humans, with a separate processing of the object's detailed shape and identity through the ventral pathway and of the visuo-spatial information through the dorsal pathway.
Here we interpret the What/Where separation in a slightly different manner, with the What devoted to analyzing the central part of the visual field, and the Where devoted to choosing the next saccade.
The "Where" is not exactly "where" but rather: where should I look next in order to increase my accuracy?</p>
</aside>
</section>
<section><h3>Methods: Computational Graph</h3>
<div align="center">
<table border=0px VALIGN="center" bgcolor=white height=921.6>
<tr padding=0px style="vertical-align:middle" bgcolor=white>
<td height=921 width="1280" padding-top=0px padding-bottom=0px style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/CNS-what-where-diagram.svg" height="921px" />
</p>
</td>
</tr>
</table>
</div>
<aside class="notes">
<p><strong>COMPUTATIONAL GRAPH:</strong>
Here is the general computational graph of our active vision system.
Two streams of information are separated from the primary visual layers: one stream processes the central pixels only, the other processes the periphery with a log-polar encoding. The two streams converge toward a decision layer that compares the central and the peripheral accuracy, in order to decide whether to issue a saccadic or a categorical response. If a saccade is produced, then the center of vision is displaced toward the region that shows the highest accuracy on the accuracy map (a minimal sketch of this decision rule is given at the end of these notes).</p>
<p><strong>WHAT:</strong>
At the core of the vision system is the identification module (the What). The What pathway is a classic convolutional classifier.
It shows some translation invariance. It can quantify its uncertainty. It monitors the Where pathway.</p>
<p>TODO: show the accuracy-map result to make the transition?</p>
<p><strong>WHERE:</strong>
Here we make the assumption that the same log-polar compression pattern is conserved from the retina up to the primary motor layers.
<strong>Each possible future saccade has an expected accuracy, which can be trained from the What pathway output.</strong> To accelerate the training, we use a shortcut, which is training the network on a translated accuracy map (with log-polar encoding). The output is thus a <strong>log-polar accuracy map</strong> that tells, for each possible visuo-motor displacement, the value of the future accuracy.
Thus, the saccadic motor output (colliculus) shows a log-polar compression similar to that of the visual input. Saccades are more precise at short than at long distance (and several saccades may be necessary to precisely reach distant targets).</p>
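<p>A minimal Python sketch of the decision layer described above. The two predictor objects and their .predict() interface are placeholders assumed for illustration, not the actual implementation:</p>
<pre><code>import numpy as np

def visuomotor_step(central_view, peripheral_view, what_net, where_net):
    """One step of the What/Where loop: either saccade or give a categorical answer."""
    class_probs = what_net.predict(central_view)        # What: identity of the foveated digit
    central_accuracy = class_probs.max()                # confidence proxy on the central view
    accuracy_map = where_net.predict(peripheral_view)   # Where: log-polar map of future accuracy

    if accuracy_map.max() > central_accuracy:
        # a saccade is expected to improve the accuracy: move the fixation point
        target = np.unravel_index(accuracy_map.argmax(), accuracy_map.shape)
        return "saccade", target
    # otherwise, answer with the most likely category
    return "answer", int(class_probs.argmax())
</code></pre>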
</aside>
</section>
<section><h3>Methods: What</h3>
<div align="center">
<table border=0px VALIGN="center" bgcolor=white height=921.6>
<tr padding=0px style="vertical-align:middle" bgcolor=white>
<td height=921 width="1280" padding-top=0px padding-bottom=0px style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/CNS-what-diagram.svg" height="921px" />
</p>
</td>
</tr>
</table>
</div>
<aside class="notes">
<p><strong>COMPUTATIONAL GRAPH:</strong>
Here is the general computational graph of our active vision system.
Two streams of information are separated from the primary visual layers: one stream processes the central pixels only, the other processes the periphery with a log-polar encoding. The two streams converge toward a decision layer that compares the central and the peripheral accuracy, in order to decide whether to issue a saccadic or a categorical response. If a saccade is produced, then the center of vision is displaced toward the region that shows the highest accuracy on the accuracy map.</p>
<p><strong>WHAT:</strong>
At the core of the vision system is the identification module (the What). The What pathway is a classic convolutional classifier (a generic sketch of such a classifier is given at the end of these notes).
It shows some translation invariance. It can quantify its uncertainty. It monitors the Where pathway.</p>
<p>TODO: show the accuracy-map result to make the transition?</p>
<p><strong>WHERE:</strong>
Here we make the assumption that the same log-polar compression pattern is conserved from the retina up to the primary motor layers.
<strong>Each possible future saccade has an expected accuracy, which can be trained from the What pathway output.</strong> To accelerate the training, we use a shortcut, which is training the network on a translated accuracy map (with log-polar encoding). The output is thus a <strong>log-polar accuracy map</strong> that tells, for each possible visuo-motor displacement, the value of the future accuracy.
Thus, the saccadic motor output (colliculus) shows a log-polar compression similar to that of the visual input. Saccades are more precise at short than at long distance (and several saccades may be necessary to precisely reach distant targets).</p>
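<p>For concreteness, a generic convolutional classifier of this kind could look as follows in Python (PyTorch). The layer sizes and the 28x28 foveal input are assumptions for illustration, not the architecture actually used:</p>
<pre><code>import torch.nn as nn

class WhatNet(nn.Module):
    """Toy What pathway: classify the digit seen in the central (foveal) window."""
    def __init__(self, n_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, padding=2), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(16, 32, kernel_size=5, padding=2), nn.ReLU(), nn.MaxPool2d(2),
        )
        self.classifier = nn.Linear(32 * 7 * 7, n_classes)

    def forward(self, x):                  # x: (batch, 1, 28, 28) foveal crop
        h = self.features(x).flatten(1)
        return self.classifier(h)          # class scores; softmax gives the accuracy proxy
</code></pre>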
</aside>
</section>
<section><h3>Methods: Where</h3>
<div align="center">
<table border=0px VALIGN="center" bgcolor=white height=921.6>
<tr padding=0px style="vertical-align:middle" bgcolor=white>
<td height=921 width="1280" padding-top=0px padding-bottom=0px style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/CNS-where-diagram.svg" height="921px" />
</p>
</td>
</tr>
</table>
</div>
<aside class="notes">
<p><strong>COMPUTATIONAL GRAPH:</strong>
Here is the general computational graph of our active vision system.
Two streams of information are separated from the primary visual layers: one stream processes the central pixels only, the other processes the periphery with a log-polar encoding. The two streams converge toward a decision layer that compares the central and the peripheral accuracy, in order to decide whether to issue a saccadic or a categorical response. If a saccade is produced, then the center of vision is displaced toward the region that shows the highest accuracy on the accuracy map.</p>
<p><strong>WHAT:</strong>
At the core of the vision system is the identification module (the What). The What pathway is a classic convolutional classifier.
It shows some translation invariance. It can quantify its uncertainty. It monitors the Where pathway.</p>
<p>TODO: show the accuracy-map result to make the transition?</p>
<p><strong>WHERE:</strong>
Here we make the assumption that the same log-polar compression pattern is conserved from the retina up to the primary motor layers (a minimal sketch of the log-polar sampling is given at the end of these notes).
<strong>Each possible future saccade has an expected accuracy, which can be trained from the What pathway output.</strong> To accelerate the training, we use a shortcut, which is training the network on a translated accuracy map (with log-polar encoding). The output is thus a <strong>log-polar accuracy map</strong> that tells, for each possible visuo-motor displacement, the value of the future accuracy.
Thus, the saccadic motor output (colliculus) shows a log-polar compression similar to that of the visual input. Saccades are more precise at short than at long distance (and several saccades may be necessary to precisely reach distant targets).</p>
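<p>A minimal Python sketch of the log-polar sampling assumed here. Grid sizes and the one-pixel-per-node sampling rule are illustrative assumptions, not the exact retinal/collicular encoding used:</p>
<pre><code>import numpy as np

def logpolar_grid(n_ecc=10, n_theta=24, r_min=2.0, r_max=64.0):
    """Sampling offsets (dx, dy): radii grow geometrically, mimicking the acuity fall-off."""
    radii = np.geomspace(r_min, r_max, n_ecc)                 # log-spaced eccentricities
    thetas = np.linspace(0, 2 * np.pi, n_theta, endpoint=False)
    return np.array([(r * np.cos(t), r * np.sin(t)) for r in radii for t in thetas])

def sample_logpolar(image, center, grid):
    """Read one pixel per log-polar node; a real encoder would average Gaussian patches."""
    cy, cx = center
    ys = np.clip((cy + grid[:, 1]).astype(int), 0, image.shape[0] - 1)
    xs = np.clip((cx + grid[:, 0]).astype(int), 0, image.shape[1] - 1)
    return image[ys, xs]                                      # flat peripheral feature vector
</code></pre>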
</aside>
</section>
</section>
<section><section><h3>Outline</h3>
<ol>
<h3>
<li>
Motivation
</li>
</h3>
<h3>
<li>
Methods
</li>
</h3>
<h3>
<li>
<p class="fragment highlight-red">
Results
</p>
</li>
</h3>
<h3>
<li>
Conclusion
</li>
</h3>
</ol>
<aside class="notes">
<p>Indeed, t...</p>
</aside>
</section>
<section><h3>Results: success</h3>
<div align="center">
<table border=0px VALIGN="center" bgcolor=white height=512.0>
<tr style="vertical-align:middle" bgcolor="white" height="256px">
<td width="100%" style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/CNS-saccade-8.png" height="256px" />
</p>
</td>
</tr>
<tr style="vertical-align:middle" bgcolor="white" height="256px">
<td width="100%" style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/CNS-saccade-20.png" height="256px" />
</p>
</td>
</tr>
</table>
</div>
<aside class="notes">
<p>TODO (Manu): generate correct images with their saccades + incorrect ones (fake).</p>
</aside>
</section>
<section><h3>Results: failure</h3>
<div align="center">
<table border=0px VALIGN="center" bgcolor=white height=614.4>
<tr style="vertical-align:middle" bgcolor="white" height="204px">
<td width="100%" style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/CNS-saccade-46.png" height="204px" />
</p>
</td>
</tr>
<tr style="vertical-align:middle" bgcolor="white" height="204px">
<td width="100%" style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/CNS-saccade-47.png" height="204px" />
</p>
</td>
</tr>
<tr style="vertical-align:middle" bgcolor="white" height="204px">
<td width="100%" style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/CNS-saccade-32.png" height="204px" />
</p>
</td>
</tr>
</table>
</div>
<aside class="notes">
<p>TODO (Manu): generate correct images with their saccades + incorrect ones (fake).</p>
<p>TODO (Manu): I would add more examples of fakes.</p>
</aside>
</section>
<section><h3>Results: one saccade</h3>
<div align="center">
<table border=0px VALIGN="center" bgcolor=white height=512.0>
<tr style="vertical-align:middle" bgcolor="white" height="512px">
<td width="100%" style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/fig_result_robust_contrast_linear_0.7_1.svg" height="512px" />
</p>
</td>
</tr>
</table>
</div>
<aside class="notes">
<p>TODO (Manu): insert results with different contrasts.</p>
</aside>
</section>
<section><h3>Results: role of contrast</h3>
<div align="center">
<table border=0px VALIGN="center" bgcolor=white height=921.6>
<tr style="vertical-align:middle" bgcolor="white" height="921px">
<td width="100%" style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/CNS-results-contrast.svg" height="921px" />
</p>
</td>
</tr>
</table>
</div>
<aside class="notes">
<p>TODO (Manu): insert results with different contrasts.</p>
</aside>
</section>
<section><h3>Results: more saccades</h3>
<div align="center">
<table border=0px VALIGN="center" bgcolor=white height=921.6>
<tr style="vertical-align:middle" bgcolor="white" height="921px">
<td width="100%" style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/CNS-results-saccades.svg" height="921px" />
</p>
</td>
</tr>
</table>
</div>
<aside class="notes">
<p>TODO (Manu): insert results with different contrasts.</p>
</aside>
</section>
<section><h3>IG-based selection of action</h3>
<div align="center">
<table border=0px VALIGN="center" bgcolor=white height=921.6>
<tr padding=0px style="vertical-align:middle" bgcolor=white>
<td height=921 width="1280" padding-top=0px padding-bottom=0px style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/CNS-IG-action-selection.svg" height="921px" />
</p>
</td>
</tr>
</table>
</div>
<aside class="notes">
<pre><code> done
</code></pre>
</aside>
</section>
</section>
<section><section><h3>Outline</h3>
<ol>
<h3>
<li>
Motivation
</li>
</h3>
<h3>
<li>
Methods
</li>
</h3>
<h3>
<li>
Results
</li>
</h3>
<h3>
<li>
<p class="fragment highlight-red">
Conclusion
</p>
</li>
</h3>
</ol>
<aside class="notes">
<p>Indeed, we will use the separation of the two problems (where and what), as they are confronted with nuisances of different kinds.</p>
</aside>
</section>
<section data-markdown>
<script type="text/template">
## Main results:
- A new interpretation of Information Gain in visuo-motor action selection:
  - Center-surround interpretation
  - An effective decoding scheme with strong bandwidth reduction
  - Information-gain-based selection of action (saccade/pursuit)
- Sub-linear object detection for image processing:
  - A full log-polar processing pathway (from early vision to action selection)
  - Sequential info gain converges to zero: in practice 2-3 saccades are enough
  - Ready for up-scaling
- Object identity-based monitoring of action:
  - Dorsal = ''actor'' (where to look next?)
  - Ventral = ''critic'' (what to see?)
</script>
</section>
<section data-markdown>
<script type="text/template">
## Limits and Open questions
- Importance of centering objects:
  - Central object referential
  - Log-polar scale/rotation invariance
  - (feedback) prediction
- Information Gain-based decision:
  - Sequential info gain converges to zero: in practice 2-3 saccades are enough
  - Pursuit vs. saccade
  - Maximizing info gain on multiple targets / degrees of freedom
  - Overt/covert attention
  - Inhibition of return
</script>
</section>
<section>
<h2 class="title">Learning where to look: <BR>A foveated visuomotor control model</h2>
<h3><a href="https://laurentperrinet.github.io/talk/2019-07-15-cns/">Emmanuel Daucé, Pierre Albigès & Laurent Perrinet</a></h3>
<img class="plain" data-src="figures/ins-logo.png" height="245.76px" /><img class="plain" data-src="http://laurentperrinet.github.io/slides.py/figures/troislogos.png" height="327.68px" />
<h4><a href="https://www.cnsorg.org/cns-2019">CNS*2019</a>, 15/7/2019 </h4>
<aside class="notes">
<ul>
<li>Thanks for your attention!</li>
</ul>
</aside>
</section>
<section>
<div align="center">
<table border=0px VALIGN="center" bgcolor=white height=829.44>
<tr padding=0px style="vertical-align:middle" bgcolor=white>
<td height=829 width="1280" padding-top=0px padding-bottom=0px style="text-align:center; vertical-align:middle" bgcolor="white" >
<p>
<img class="plain" data-src="figures/qr.png" height="829px" />
</p>
</td>
</tr>
</table>
</div>
<BR><a href="https://laurentperrinet.github.io/talk/2019-07-15-cns"> https://laurentperrinet.github.io/talk/2019-07-15-cns </a>
<aside class="notes">
<p>All the material is available online - please scan this QR code: it leads to a page with links to further references and code. TODO: use arXiv instead.</p>
</aside>
</section>
</section>
</div>
</div>
<script src="https://cdnjs.cloudflare.com/ajax/libs/reveal.js/3.7.0/lib/js/head.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/reveal.js/3.7.0/js/reveal.js"></script>
<script>
// Full list of configuration options available at:
// https://github.com/hakimel/reveal.js#configuration
Reveal.initialize({
// The "normal" size of the presentation, aspect ratio will be preserved
// when the presentation is scaled to fit different resolutions. Can be
// specified using percentage units.
width: 1280,
height: 1024,
// Factor of the display size that should remain empty around the content
margin: 0.1618,
// Display a presentation progress bar
progress: true,
slideNumber: 'c/t',
// Push each slide change to the browser history
//history: false,
// Vertical centering of slides
center: false,
// Enables touch navigation on devices with touch input
touch: true,
// Bounds for smallest/largest possible scale to apply to content
minScale: 0.2,
maxScale: 2.5,
// Display controls in the bottom right corner
controls: false,
// Keyboard shortcuts for navigation are enabled via the 'keyboard' object below
// Enable the slide overview mode
overview: true,
// Loop the presentation
//loop: false,
// Change the presentation direction to be RTL
//rtl: false,
// Number of milliseconds between automatically proceeding to the
// next slide, disabled when set to 0, this value can be overwritten
// by using a data-autoslide attribute on your slides
//autoSlide: 0,
// Enable slide navigation via mouse wheel
//mouseWheel: false,
// Parallax background image
//parallaxBackgroundImage: '/Users/laurentperrinet/cloud_nas/2015_RTC/2014-04-17_HDR/figures/p4100011.jpg', // e.g. "https://s3.amazonaws.com/hakim-static/reveal-js/reveal-parallax-1.jpg"
// Parallax background size
//parallaxBackgroundSize: '3200px 2000px', // CSS syntax, e.g. "2100px 900px" - currently only pixels are supported (don't use % or auto)
// This slide transition gives best results:
transition: 'fade', // default/cube/page/concave/zoom/linear/fade/none
// Transition speed
transitionSpeed: 'slow', // default/fast/slow
// Transition style for full page backgrounds
backgroundTransition: 'none', // default/linear/none
// Turns fragments on and off globally
fragments: true,
// Theme
theme: 'simple', // available themes are in /css/theme
math: {
mathjax: 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js',
config: 'TeX-AMS_HTML-full' // See http://docs.mathjax.org/en/latest/config-files.html
},
chalkboard: {
// optionally load pre-recorded chalkboard drawing from file
src: "chalkboard.json",
},
// Optional reveal.js plugins
dependencies: [
{ src: 'https://cdnjs.cloudflare.com/ajax/libs/reveal.js/3.7.0/lib/js/classList.js', condition: function() { return !document.body.classList; } },
{ src: 'https://cdnjs.cloudflare.com/ajax/libs/reveal.js/3.7.0/plugin/markdown/marked.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
{ src: 'https://cdnjs.cloudflare.com/ajax/libs/reveal.js/3.7.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
{ src: 'https://cdnjs.cloudflare.com/ajax/libs/reveal.js/3.7.0/plugin/highlight/highlight.js', async: true, callback: function() { hljs.initHighlightingOnLoad(); } },
{ src: 'https://cdnjs.cloudflare.com/ajax/libs/reveal.js/3.7.0/plugin/chalkboard/chalkboard.js' },
{ src: 'https://cdnjs.cloudflare.com/ajax/libs/reveal.js/3.7.0/plugin/zoom-js/zoom.js', async: true },
{ src: 'https://cdnjs.cloudflare.com/ajax/libs/reveal.js/3.7.0/plugin/notes/notes.js', async: true },
{ src: 'https://cdnjs.cloudflare.com/ajax/libs/reveal.js/3.7.0/plugin/math/math.js', async: true },
{ src: 'https://cdnjs.cloudflare.com/ajax/libs/reveal.js/3.7.0/plugin/mathsvg/math.js', async: true },
],
keyboard: {
67: function() {{ RevealChalkboard.toggleNotesCanvas() }}, // toggle notes canvas when 'c' is pressed
66: function() {{ RevealChalkboard.toggleChalkboard() }}, // toggle chalkboard when 'b' is pressed
46: function() {{ RevealChalkboard.clear() }}, // clear chalkboard when 'DEL' is pressed
8: function() {{ RevealChalkboard.reset() }}, // reset chalkboard data on current slide when 'BACKSPACE' is pressed
68: function() {{ RevealChalkboard.download() }}, // download recorded chalkboard drawing when 'd' is pressed
},
});
</script>
</body>
</html>