@inproceedings{laokulrat-etal-2016-generating,
title = "Generating Video Description using Sequence-to-sequence Model with Temporal Attention",
author = "Laokulrat, Natsuda and
Phan, Sang and
Nishida, Noriki and
Shu, Raphael and
Ehara, Yo and
Okazaki, Naoaki and
Miyao, Yusuke and
Nakayama, Hideki",
editor = "Matsumoto, Yuji and
Prasad, Rashmi",
booktitle = "Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers",
month = dec,
year = "2016",
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://aclanthology.org/C16-1005",
pages = "44--52",
abstract = "Automatic video description generation has recently been getting attention after rapid advancement in image caption generation. Automatically generating description for a video is more challenging than for an image due to its temporal dynamics of frames. Most of the work relied on Recurrent Neural Network (RNN) and recently attentional mechanisms have also been applied to make the model learn to focus on some frames of the video while generating each word in a describing sentence. In this paper, we focus on a sequence-to-sequence approach with temporal attention mechanism. We analyze and compare the results from different attention model configuration. By applying the temporal attention mechanism to the system, we can achieve a METEOR score of 0.310 on Microsoft Video Description dataset, which outperformed the state-of-the-art system so far.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="laokulrat-etal-2016-generating">
<titleInfo>
<title>Generating Video Description using Sequence-to-sequence Model with Temporal Attention</title>
</titleInfo>
<name type="personal">
<namePart type="given">Natsuda</namePart>
<namePart type="family">Laokulrat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sang</namePart>
<namePart type="family">Phan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Noriki</namePart>
<namePart type="family">Nishida</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raphael</namePart>
<namePart type="family">Shu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yo</namePart>
<namePart type="family">Ehara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yusuke</namePart>
<namePart type="family">Miyao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hideki</namePart>
<namePart type="family">Nakayama</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuji</namePart>
<namePart type="family">Matsumoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rashmi</namePart>
<namePart type="family">Prasad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>The COLING 2016 Organizing Committee</publisher>
<place>
<placeTerm type="text">Osaka, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>Automatic video description generation has recently attracted attention following the rapid advances in image caption generation. Automatically generating a description for a video is more challenging than for an image because of the temporal dynamics of its frames. Most prior work has relied on Recurrent Neural Networks (RNNs), and attention mechanisms have recently been applied so that the model learns to focus on particular frames of the video while generating each word of the describing sentence. In this paper, we focus on a sequence-to-sequence approach with a temporal attention mechanism. We analyze and compare the results of different attention model configurations. By applying the temporal attention mechanism to the system, we achieve a METEOR score of 0.310 on the Microsoft Video Description dataset, outperforming the previous state-of-the-art system.</abstract>
<identifier type="citekey">laokulrat-etal-2016-generating</identifier>
<location>
<url>https://aclanthology.org/C16-1005</url>
</location>
<part>
<date>2016-12</date>
<extent unit="page">
<start>44</start>
<end>52</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Generating Video Description using Sequence-to-sequence Model with Temporal Attention
%A Laokulrat, Natsuda
%A Phan, Sang
%A Nishida, Noriki
%A Shu, Raphael
%A Ehara, Yo
%A Okazaki, Naoaki
%A Miyao, Yusuke
%A Nakayama, Hideki
%Y Matsumoto, Yuji
%Y Prasad, Rashmi
%S Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers
%D 2016
%8 December
%I The COLING 2016 Organizing Committee
%C Osaka, Japan
%F laokulrat-etal-2016-generating
%X Automatic video description generation has recently attracted attention following the rapid advances in image caption generation. Automatically generating a description for a video is more challenging than for an image because of the temporal dynamics of its frames. Most prior work has relied on Recurrent Neural Networks (RNNs), and attention mechanisms have recently been applied so that the model learns to focus on particular frames of the video while generating each word of the describing sentence. In this paper, we focus on a sequence-to-sequence approach with a temporal attention mechanism. We analyze and compare the results of different attention model configurations. By applying the temporal attention mechanism to the system, we achieve a METEOR score of 0.310 on the Microsoft Video Description dataset, outperforming the previous state-of-the-art system.
%U https://aclanthology.org/C16-1005
%P 44-52
Markdown (Informal)
[Generating Video Description using Sequence-to-sequence Model with Temporal Attention](https://aclanthology.org/C16-1005) (Laokulrat et al., COLING 2016)
ACL
Natsuda Laokulrat, Sang Phan, Noriki Nishida, Raphael Shu, Yo Ehara, Naoaki Okazaki, Yusuke Miyao, and Hideki Nakayama. 2016. Generating Video Description using Sequence-to-sequence Model with Temporal Attention. In Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers, pages 44–52, Osaka, Japan. The COLING 2016 Organizing Committee.
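
For readers who want a concrete picture of the temporal attention the abstract describes (the decoder learning to focus on particular video frames while generating each word), the snippet below is a minimal NumPy sketch. The additive scoring form, parameter names, and dimensions are assumptions chosen for illustration only; this is not the configuration or implementation used in the paper.

```python
# Minimal sketch of temporal attention over per-frame video features.
# All names, shapes, and the additive (Bahdanau-style) scoring function are
# illustrative assumptions, not the authors' implementation.
import numpy as np

def temporal_attention(frame_features, decoder_state, W_f, W_s, v):
    """Return an attention-weighted context vector over video frames.

    frame_features: (num_frames, feat_dim) per-frame features (assumed precomputed)
    decoder_state:  (state_dim,) current decoder hidden state
    W_f, W_s, v:    projection parameters (random placeholders here)
    """
    # One scalar score per frame, conditioned on the current decoder state.
    scores = np.tanh(frame_features @ W_f + decoder_state @ W_s) @ v  # (num_frames,)
    # Softmax turns scores into weights that sum to 1 across frames.
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()
    # Context vector: weighted average of frame features, recomputed per word.
    context = weights @ frame_features  # (feat_dim,)
    return context, weights

# Toy usage with random values standing in for real features and parameters.
rng = np.random.default_rng(0)
num_frames, feat_dim, state_dim, attn_dim = 26, 512, 256, 128
frames = rng.standard_normal((num_frames, feat_dim))
state = rng.standard_normal(state_dim)
W_f = rng.standard_normal((feat_dim, attn_dim)) * 0.01
W_s = rng.standard_normal((state_dim, attn_dim)) * 0.01
v = rng.standard_normal(attn_dim) * 0.01
context, weights = temporal_attention(frames, state, W_f, W_s, v)
print(weights.shape, context.shape)  # (26,) (512,)
```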