vqa_challenge_2017_leaderboard.json
[{"team_members": "N/A", "challenge": {"overall": 65.18, "perAnswerType": {"other": 55.91, "number": 41.68, "yes/no": 82.08}}, "dev": {"overall": 64.96, "perAnswerType": {"other": 55.78, "number": 41.38, "yes/no": 82.03}}, "standard": {"overall": 65.3, "perAnswerType": {"other": 55.86, "number": 41.99, "yes/no": 82.27}}, "team_name_order": 1, "team_name": "Adelaide-Teney", "ref": "", "method": "TBA"}, {"team_members": "Damien Teney (University of Adelaide), Anton van den Hengel (University of Adelaide), Peter Anderson* (Australian National University), David Golub* (Stanford University), Po-Sen Huang (Microsoft Research), Lei Zhang (Microsoft Research), Xiaodong He (Microsoft Research) (* Work performed while interning at MSR)", "challenge": {"overall": 69.0, "perAnswerType": {"other": 59.95, "number": 47.35, "yes/no": 85.18}}, "dev": {"overall": 69.0, "perAnswerType": {"other": 59.88, "number": 48.19, "yes/no": 85.24}}, "standard": {"overall": 69.13, "perAnswerType": {"other": 59.82, "number": 47.45, "yes/no": 85.54}}, "team_name_order": 2, "team_name": "Adelaide-Teney ACRV MSR", "ref": "https://arxiv.org/abs/1708.02711", "method": "A simple neural network with a few tricks and image features generated with bottom-up attention."}, {"team_members": "Zhou Su* (Intel Labs China), Chen Zhu* (Shanghai Tech University), Dongqi Cai (Intel Labs China), Jianguo Li (Intel Labs China) (* indicates equal contributions)", "challenge": {"overall": 66.68, "perAnswerType": {"other": 58.04, "number": 43.32, "yes/no": 82.83}}, "dev": {"overall": 66.48, "perAnswerType": {"other": 57.97, "number": 43.73, "yes/no": 82.56}}, "standard": {"overall": 66.67, "perAnswerType": {"other": 57.95, "number": 43.17, "yes/no": 82.88}}, "team_name_order": 3, "team_name": "Athena", "ref": "", "method": "We invented a novel method which combines structured visual attention, knowledge graph, memory network together for accurate VQA. The final submission is very simple snapshot-based ensemble result."}, {"team_members": "Gera Versfeld (Center for Research in Computer Vision, University of Central Florida), Yifan Ding (Center for Research in Computer Vision, University of Central Florida), Boqing Gong (Center for Research in Computer Vision, University of Central Florida)", "challenge": {"overall": 60.72, "perAnswerType": {"other": 54.75, "number": 37.22, "yes/no": 73.88}}, "dev": {"overall": 60.65, "perAnswerType": {"other": 54.85, "number": 36.82, "yes/no": 73.91}}, "standard": {"overall": 60.81, "perAnswerType": {"other": 54.84, "number": 36.43, "yes/no": 74.08}}, "team_name_order": 4, "team_name": "CRCV_REU", "ref": "", "method": "Using the pre-trained MCB model developed by Fukui et al. as a baseline, we fine-tuned their model on the VQA-v2 dataset. 
We also added a part of speech based weighting system to the question word embeddings."}, {"team_members": "Yuetan Lin (Zhejiang University), Zhangyang Pang (Zhejiang University), Donghui Wang (Zhejiang University)", "challenge": {"overall": 62.53, "perAnswerType": {"other": 53.14, "number": 38.31, "yes/no": 79.77}}, "dev": {"overall": 62.47, "perAnswerType": {"other": 53.08, "number": 38.72, "yes/no": 79.84}}, "standard": {"overall": 62.54, "perAnswerType": {"other": 52.95, "number": 38.64, "yes/no": 79.85}}, "team_name_order": 5, "team_name": "DCD_ZJU", "ref": "https://arxiv.org/abs/1702.06700", "method": "MLB with saliency score visual feature pre-processing and hadamard product attention."}, {"team_members": "YoungChul Sohn* (FCS Lab, Software R&D Center, SAMSUNG ELECTRONICS CO., LTD.), Kibeom Lee* (FCS Lab, Software R&D Center, SAMSUNG ELECTRONICS CO., LTD.), Jong-Ryul Lee* (FCS Lab, Software R&D Center, SAMSUNG ELECTRONICS CO., LTD.), Gyu-tae Park* (FCS Lab, Software R&D Center, SAMSUNG ELECTRONICS CO., LTD.) (* All equal contribution)", "challenge": {"overall": 68.07, "perAnswerType": {"other": 60.17, "number": 46.42, "yes/no": 82.92}}, "dev": {"overall": 67.95, "perAnswerType": {"other": 59.94, "number": 47.08, "yes/no": 82.94}}, "standard": {"overall": 68.22, "perAnswerType": {"other": 60.15, "number": 46.66, "yes/no": 83.17}}, "team_name_order": 6, "team_name": "DLAIT", "ref": "", "method": "Multimodal Attention Network with Attention Supervision. Data augmentation using Visual Genome + Region-based QA (generated from Visual Genome)."}, {"team_members": "Zhou Yu (Hangzhou Dianzi University, China), Jun Yu (Hangzhou Dianzi University, China), Chenchao Xiang (Hangzhou Dianzi University, China), Dalu Guo (The Unversity of Sydney, Australia), Jianping Fan (University of North Carolina at Charlotte, USA), Dacheng Tao (The University of Sydney, Australia)", "challenge": {"overall": 68.16, "perAnswerType": {"other": 59.42, "number": 45.36, "yes/no": 84.28}}, "dev": {"overall": 68.02, "perAnswerType": {"other": 59.14, "number": 45.76, "yes/no": 84.39}}, "standard": {"overall": 68.09, "perAnswerType": {"other": 59.01, "number": 45.39, "yes/no": 84.5}}, "team_name_order": 7, "team_name": "HDU-USYD-UNCC", "ref": "", "method": "Our best entry is obtained by an ensemble of the proposed Multi-modal Factorized High-order (MFH) Pooling with Co-Attention Learning model. MFH is a high-order pooling model that outperforms the existing bilinear pooling models, and co-attention model capture the image attention and question attention simultaneously in an end-to-end neural network model. 
With an ensemble of 9 models, we achieve the overall accuracy 68.09 on the test-standard set."}, {"team_members": "Zhiwei Fang (Institute of Automation, Chinese Academy of Sciences), Zhenwei Shen (Shandong University of Science and Technology), Xinxin Zhu (Beijing University of Posts and Telecommunications), Longteng Guo (Institute of Automation, Chinese Academy of Sciences), Fei Liu (Institute of Automation, Chinese Academy of Sciences), Jun Fu (Institute of Automation, Chinese Academy of Sciences), Liu Jing (Institute of Automation, Chinese Academy of Sciences)", "challenge": {"overall": 65.64, "perAnswerType": {"other": 57.81, "number": 41.61, "yes/no": 81.05}}, "dev": {"overall": 65.56, "perAnswerType": {"other": 57.82, "number": 41.56, "yes/no": 81.07}}, "standard": {"overall": 65.7, "perAnswerType": {"other": 57.83, "number": 41.56, "yes/no": 81.09}}, "team_name_order": 8, "team_name": "JuneflowerIvaNlpr", "ref": "", "method": "We utilize caption annotation of MSCOCO and design a CNN as the encoder of captions and questions. The final score is the result of multi-model ensemble."}, {"team_members": "Ilija Ilievski (National University of Singapore), Jiashi Feng (National University of Singapore)", "challenge": {"overall": 67.62, "perAnswerType": {"other": 59.79, "number": 48.55, "yes/no": 81.71}}, "dev": {"overall": 67.71, "perAnswerType": {"other": 59.99, "number": 48.31, "yes/no": 81.95}}, "standard": {"overall": 67.64, "perAnswerType": {"other": 59.63, "number": 48.38, "yes/no": 81.92}}, "team_name_order": 9, "team_name": "LV_NUS", "ref": "https://github.com/ilija139/vqa-soft", "method": "An ensemble of models, each trained to learn different aspects of the problem. This is achieved by training some of the ensemble models with a new loss function, termed as 'Soft Cross Entropy' that more faithfully models the VQA evaluation metric, and by training some models only on subsets of the VQA dataset based on the question type (yes/no, number, and other)."}, {"team_members": "N/A", "challenge": {"overall": 49.16, "perAnswerType": {"other": 35.72, "number": 34.37, "yes/no": 68.49}}, "dev": {"overall": 49.32, "perAnswerType": {"other": 35.76, "number": 34.52, "yes/no": 69.02}}, "standard": {"overall": 49.56, "perAnswerType": {"other": 35.97, "number": 34.16, "yes/no": 69.22}}, "team_name_order": 10, "team_name": "MIC_TJ", "ref": "", "method": "online_Top1000_IMG_VGG19_D22S40"}, {"team_members": "N/A", "challenge": {"overall": 37.3, "perAnswerType": {"other": 16.7, "number": 29.66, "yes/no": 62.87}}, "dev": {"overall": 37.3, "perAnswerType": {"other": 16.59, "number": 29.73, "yes/no": 63.23}}, "standard": {"overall": 37.33, "perAnswerType": {"other": 16.68, "number": 29.97, "yes/no": 62.98}}, "team_name_order": 11, "team_name": "MultiLab", "ref": "", "method": "ensemble"}, {"team_members": "Hyeonwoo Noh (POSTECH), Jonghwan Moon (POSTECH), Woongi Chang (POSTECH), Bohyung Han (POSTECH)", "challenge": {"overall": 63.79, "perAnswerType": {"other": 55.59, "number": 41.26, "yes/no": 79.21}}, "dev": {"overall": 63.45, "perAnswerType": {"other": 55.35, "number": 40.9, "yes/no": 78.98}}, "standard": {"overall": 63.66, "perAnswerType": {"other": 55.3, "number": 40.67, "yes/no": 79.32}}, "team_name_order": 12, "team_name": "POSTECH", "ref": "", "method": "Gated neural module network ensembled with LSTM+CNN+Spatial Attention."}, {"team_members": "N/A", "challenge": {"overall": 43.28, "perAnswerType": {"other": 25.62, "number": 31.32, "yes/no": 66.67}}, "dev": {"overall": 43.3, "perAnswerType": 
{"other": 25.46, "number": 31.54, "yes/no": 67.1}}, "standard": {"overall": 43.48, "perAnswerType": {"other": 25.81, "number": 31.38, "yes/no": 66.97}}, "team_name_order": 13, "team_name": "UPC", "ref": "", "method": "Image features are obtained using a pretrained VGG in imagenet dataset. Textual representation is obtained using GloVe word embeddings and a LSTM layer to embed the sentence. Both representations are combined using an element-wise product."}, {"team_members": "Hedi Ben-younes* (Sorbonne University, UPMC, CNRS, LIP6 and Heuritech company), Remi Cadene* (Sorbonne University, UPMC, CNRS, LIP6), Matthieu Cord (Sorbonne University, UPMC, CNRS, LIP6), Nicolas Thome (CNAM) (* Equal contribution (order randomly chosen))", "challenge": {"overall": 65.67, "perAnswerType": {"other": 57.27, "number": 40.72, "yes/no": 81.97}}, "dev": {"overall": 65.57, "perAnswerType": {"other": 57.07, "number": 41.62, "yes/no": 81.96}}, "standard": {"overall": 65.71, "perAnswerType": {"other": 57.12, "number": 41.06, "yes/no": 82.07}}, "team_name_order": 14, "team_name": "UPMC-LIP6", "ref": "https://arxiv.org/pdf/1705.06676.pdf", "method": "Our method is based of the MUTAN architecture, which is a trainable fusion operator that models fine and rich interaction between image and text modalities. Our final submission is a bagging of several models with different kinds of fusion operators. Some use an attention strategy. Some others have their convolutional neural network fine tuned."}, {"team_members": "Peng Wang (Northwestern Polytechnical University, China), Qi Wu (Australia Centre for Visual Technologies, University of Adelaide)", "challenge": {"overall": 63.1, "perAnswerType": {"other": 53.67, "number": 40.86, "yes/no": 79.84}}, "dev": {"overall": 62.62, "perAnswerType": {"other": 53.24, "number": 40.95, "yes/no": 79.4}}, "standard": {"overall": 62.97, "perAnswerType": {"other": 53.35, "number": 40.91, "yes/no": 79.82}}, "team_name_order": 15, "team_name": "VQAMachine", "ref": "http://openaccess.thecvf.com/content_cvpr_2017/papers/Wang_The_VQA-Machine_Learning_CVPR_2017_paper.pdf", "method": "We propose a new co-attention model which can use multiple existing vision algorithms. 
Our model not only answer questions but also generates human-readable reasons for its decision."}, {"team_members": "N/A", "challenge": {"overall": 63.14, "perAnswerType": {"other": 57.3, "number": 38.52, "yes/no": 76.45}}, "dev": {"overall": 63.37, "perAnswerType": {"other": 57.59, "number": 39.4, "yes/no": 76.63}}, "standard": {"overall": 63.31, "perAnswerType": {"other": 57.31, "number": 39.29, "yes/no": 76.52}}, "team_name_order": 16, "team_name": "anon_team", "ref": "", "method": "Under review"}, {"team_members": "N/A", "challenge": {"overall": 64.79, "perAnswerType": {"other": 55.12, "number": 45.22, "yes/no": 81.09}}, "dev": {"overall": 64.78, "perAnswerType": {"other": 55.3, "number": 44.78, "yes/no": 81.22}}, "standard": {"overall": 65.05, "perAnswerType": {"other": 55.3, "number": 45.11, "yes/no": 81.5}}, "team_name_order": 17, "team_name": "coral2017", "ref": "", "method": "ensemble"}, {"team_members": "N/A", "challenge": {"overall": 65.79, "perAnswerType": {"other": 57.0, "number": 42.89, "yes/no": 82.0}}, "dev": {"overall": 65.76, "perAnswerType": {"other": 57.07, "number": 43.64, "yes/no": 81.85}}, "standard": {"overall": 65.84, "perAnswerType": {"other": 56.71, "number": 43.06, "yes/no": 82.32}}, "team_name_order": 18, "team_name": "lonely_shepherd", "ref": "", "method": "Structured Attention"}, {"team_members": "N/A", "challenge": {"overall": 55.37, "perAnswerType": {"other": 47.36, "number": 35.53, "yes/no": 69.86}}, "dev": {"overall": 55.35, "perAnswerType": {"other": 47.32, "number": 35.39, "yes/no": 70.1}}, "standard": {"overall": 55.28, "perAnswerType": {"other": 47.18, "number": 35.65, "yes/no": 69.77}}, "team_name_order": 19, "team_name": "neural-vqa-attention", "ref": "https://arxiv.org/abs/1511.02274", "method": "Stacked Attention Network, Yang et al., CVPR 2016. neural-vqa-attention: https://github.com/abhshkdz/neural-vqa-attention"}, {"team_members": "N/A", "challenge": {"overall": 64.67, "perAnswerType": {"other": 56.0, "number": 40.38, "yes/no": 81.11}}, "dev": {"overall": 64.47, "perAnswerType": {"other": 55.68, "number": 41.24, "yes/no": 81.0}}, "standard": {"overall": 64.79, "perAnswerType": {"other": 55.82, "number": 40.74, "yes/no": 81.41}}, "team_name_order": 20, "team_name": "usyd_zju", "ref": "", "method": "model_4"}, {"team_members": "Manoj Acharya (RIT)", "challenge": {"overall": 62.76, "perAnswerType": {"other": 53.39, "number": 38.69, "yes/no": 79.95}}, "dev": {"overall": 62.59, "perAnswerType": {"other": 53.3, "number": 39.09, "yes/no": 79.78}}, "standard": {"overall": 62.89, "perAnswerType": {"other": 53.58, "number": 38.95, "yes/no": 79.88}}, "team_name_order": 21, "team_name": "vqa_hack3r", "ref": "", "method": "We train our system using state-of-the-art VQA algorithms."}, {"team_members": "Ahmed Osman (Fraunhofer Heinrich Hertz Institute & University of Freiburg), Wojciech Samek (Fraunhofer Heinrich Hertz Institute)", "challenge": {"overall": 62.39, "perAnswerType": {"other": 53.53, "number": 39.77, "yes/no": 78.61}}, "dev": {"overall": 62.24, "perAnswerType": {"other": 53.58, "number": 40.31, "yes/no": 78.27}}, "standard": {"overall": 62.66, "perAnswerType": {"other": 53.76, "number": 39.91, "yes/no": 78.86}}, "team_name_order": 22, "team_name": "vqahhi_drau", "ref": "", "method": "We propose propose a model with Dual Recurrent Attention Units (DRAU), which utilizes recurrent layers to generate textual and visual attention. 
The memory characteristic of recurrent layers improves performance in relational and sequential reasoning tasks such as counting. Our proposed attention mechanism can be integrated in most VQA models fairly easily."}, {"team_members": "Yash Goyal (Virginia Tech), Tejas Khot (Virginia Tech), Douglas Summers-Stay (ARL), Dhruv Batra (Georgia Tech), Devi Parikh (Georgia Tech)", "challenge": {"overall": 54.08, "perAnswerType": {"other": 41.91, "number": 35.52, "yes/no": 72.99}}, "dev": {"overall": 54.02, "perAnswerType": {"other": 41.93, "number": 35.43, "yes/no": 73.08}}, "standard": {"overall": 54.22, "perAnswerType": {"other": 41.83, "number": 35.18, "yes/no": 73.46}}, "team_name_order": 23, "team_name": "vqateam_deeper_LSTM_Q_norm_I", "ref": "https://arxiv.org/abs/1505.00468", "method": "Baseline VQA model from Antol et al., ICCV 2015. 2-channel (image and question) model. Question channel (LSTM with 2 hidden layers) provides question representation and the image channel (activations from last hidden layer of VGGNet) provides image representation. The image features thus obtained are l2 normalized. Question and image features are pointwise multiplied and fed to fully connected layer to obtain softmax distribution over 1000 answers."}, {"team_members": "Yash Goyal (Virginia Tech), Tejas Khot (Virginia Tech), Douglas Summers-Stay (ARL), Dhruv Batra (Georgia Tech), Devi Parikh (Georgia Tech)", "challenge": {"overall": 44.34, "perAnswerType": {"other": 27.64, "number": 31.75, "yes/no": 66.79}}, "dev": {"overall": 44.22, "perAnswerType": {"other": 27.36, "number": 31.41, "yes/no": 67.17}}, "standard": {"overall": 44.26, "perAnswerType": {"other": 27.37, "number": 31.55, "yes/no": 67.01}}, "team_name_order": 24, "team_name": "vqateam_language_only", "ref": "", "method": "Similar model architecture as Deeper LSTM Question + Normalized Image model from Antol et al., ICCV 2015 but without the image channel. Question is passed through an LSTM with 2 hidden layers to provide question representation, which is fed to fully connected layer to obtain softmax distribution over 1000 answers."}, {"team_members": "Yash Goyal (Virginia Tech), Tejas Khot (Virginia Tech), Douglas Summers-Stay (ARL), Dhruv Batra (Georgia Tech), Devi Parikh (Georgia Tech)", "challenge": {"overall": 62.33, "perAnswerType": {"other": 53.47, "number": 38.52, "yes/no": 78.85}}, "dev": {"overall": 61.96, "perAnswerType": {"other": 53.23, "number": 38.81, "yes/no": 78.41}}, "standard": {"overall": 62.27, "perAnswerType": {"other": 53.36, "number": 38.28, "yes/no": 78.82}}, "team_name_order": 25, "team_name": "vqateam_mcb_benchmark", "ref": "https://arxiv.org/abs/1606.01847", "method": "'MCB + Att.' model (row 3, Table 4) from Fukui et al., EMNLP 2016. 
This model is trained only on VQA v2.0 train+val set (without using Visual Genome data) and without using pretrained Glove embeddings."}, {"team_members": "Yash Goyal (Virginia Tech), Tejas Khot (Virginia Tech), Douglas Summers-Stay (ARL), Dhruv Batra (Georgia Tech), Devi Parikh (Georgia Tech)", "challenge": {"overall": 25.98, "perAnswerType": {"other": 1.13, "number": 0.34, "yes/no": 61.26}}, "dev": {"overall": 25.7, "perAnswerType": {"other": 1.11, "number": 0.33, "yes/no": 61.03}}, "standard": {"overall": 25.98, "perAnswerType": {"other": 1.17, "number": 0.36, "yes/no": 61.2}}, "team_name_order": 26, "team_name": "vqateam_prior", "ref": "", "method": "'yes' (prior) is predicted as the answer for all questions"}, {"team_members": "N/A", "challenge": {"overall": 63.62, "perAnswerType": {"other": 55.22, "number": 40.28, "yes/no": 79.49}}, "dev": {"overall": 63.36, "perAnswerType": {"other": 54.92, "number": 40.51, "yes/no": 79.37}}, "standard": {"overall": 63.57, "perAnswerType": {"other": 54.75, "number": 40.53, "yes/no": 79.77}}, "team_name_order": 27, "team_name": "yahia zakaria", "ref": "", "method": "Stacked Attention Network with Tree Sequence Encoder (2 Model Ensemble)"}, {"team_members": "N/A", "challenge": {"overall": 65.23, "perAnswerType": {"other": 55.3, "number": 45.66, "yes/no": 81.84}}, "dev": {"overall": 65.17, "perAnswerType": {"other": 55.46, "number": 45.6, "yes/no": 81.75}}, "standard": {"overall": 65.41, "perAnswerType": {"other": 55.43, "number": 45.56, "yes/no": 82.1}}, "team_name_order": 28, "team_name": "yudf2001", "ref": "", "method": "ensemble-results"}, {"date": "2017-10-22"}]
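For reference, a minimal sketch of how this leaderboard could be loaded and ranked, assuming Python 3 and that the JSON above is saved locally as vqa_challenge_2017_leaderboard.json (the path is an assumption). The final array element is a date stamp rather than a team entry, so it is filtered out before sorting by test-standard overall accuracy.

import json

# Assumed local path to the file shown above.
with open("vqa_challenge_2017_leaderboard.json") as f:
    entries = json.load(f)

# The last element ({"date": ...}) is metadata, not a team entry.
teams = [e for e in entries if "team_name" in e]

# Rank teams by overall accuracy on the test-standard split.
for team in sorted(teams, key=lambda e: e["standard"]["overall"], reverse=True):
    print(f'{team["standard"]["overall"]:6.2f}  {team["team_name"]}')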