% Workshop paper introducing the AMARA parallel subtitle corpus (IWSLT 2013).
% NOTE(review): workshop proceedings usually carry no volume number; the
% value 13 below may be an export artifact mirroring the year -- confirm.
@inproceedings{guzman:2013:amara,
  author    = {Guzm{\'a}n, Francisco and Sajjad, Hassan and Abdelali, Ahmed and Vogel, Stephan},
  title     = {The {AMARA} Corpus: Building Resources for Translating the Web's Educational Content},
  booktitle = {Proceedings of the 10th International Workshop on Spoken Language Translation ({IWSLT}'13)},
  volume    = {13},
  address   = {Heidelberg, Germany},
  month     = dec,
  year      = {2013},
  abstract  = {In this paper, we introduce a new parallel corpus of subtitles of educational videos: the AMARA corpus for online educational content. We crawl a multilingual collection community generated subtitles, and present the results of processing the Arabic–English portion of the data, which yields a parallel corpus of about 2.6M Arabic and 3.9M English words. We explore different approaches to align the segments, and extrinsically evaluate the resulting parallel corpus on the standard TED-talks tst-2010. We observe that the data can be successfully used for this task, and also observe an absolute improvement of 1.6 BLEU when it is used in combination with TED data. Finally, we analyze some of the specific challenges when translating the educational content.},
}