Publications

Google Scholar · DBLP · Semantic Scholar

Thamme Gowda and Roman Grundkiewicz and Elijah Rippeth and Matt Post and Marcin Junczys-Dowmunt. "PyMarian: Fast Neural Machine Translation and Evaluation in Python". 2024. [Link] [BibTeX]

% arXiv preprint. Braced words in the title are proper nouns that must keep their casing under sentence-casing styles.
@misc{gowda2024pymarianfastneuralmachine,
  author        = {Gowda, Thamme and Grundkiewicz, Roman and Rippeth, Elijah and Post, Matt and Junczys-Dowmunt, Marcin},
  title         = {{PyMarian}: Fast Neural Machine Translation and Evaluation in {Python}},
  year          = {2024},
  eprint        = {2408.11853},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2408.11853},
}

Cho, Hyundong and Gowda, Thamme and Huang, Yuyang and Lu, Zixun and Tong, Tianli and May, Jonathan. "BotEval: Facilitating Interactive Human Evaluation". Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations). 2024. [Link] [BibTeX]

@inproceedings{cho-etal-2024-boteval,
  author    = {Cho, Hyundong and Gowda, Thamme and Huang, Yuyang and Lu, Zixun and Tong, Tianli and May, Jonathan},
  editor    = {Cao, Yixin and Feng, Yang and Xiong, Deyi},
  title     = {{B}ot{E}val: Facilitating Interactive Human Evaluation},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)},
  publisher = {Association for Computational Linguistics},
  address   = {Bangkok, Thailand},
  month     = aug,
  year      = {2024},
  pages     = {107--116},
  url       = {https://aclanthology.org/2024.acl-demos.11},
}

Kocmi, Tom and Avramidis, Eleftherios and Bawden, Rachel and Bojar, Ondřej and Dvorkovich, Anton and Federmann, Christian and Fishel, Mark and Freitag, Markus and Gowda, Thamme and Grundkiewicz, Roman and Haddow, Barry and Koehn, Philipp and Marie, Benjamin and Monz, Christof and Morishita, Makoto and Murray, Kenton and Nagata, Makoto and Nakazawa, Toshiaki and Popel, Martin and Popović, Maja and Shmatova, Mariya. "Findings of the 2023 Conference on Machine Translation (WMT23): LLMs Are Here but Not Quite There Yet". Proceedings of the Eighth Conference on Machine Translation. 2023. [Link] [DOI] [BibTeX]

@inproceedings{kocmi-etal-2023-findings,
  author    = {Kocmi, Tom and Avramidis, Eleftherios and Bawden, Rachel
               and Bojar, Ond{\v{r}}ej and Dvorkovich, Anton and Federmann, Christian
               and Fishel, Mark and Freitag, Markus and Gowda, Thamme
               and Grundkiewicz, Roman and Haddow, Barry and Koehn, Philipp
               and Marie, Benjamin and Monz, Christof and Morishita, Makoto
               and Murray, Kenton and Nagata, Makoto and Nakazawa, Toshiaki
               and Popel, Martin and Popovi{\'c}, Maja and Shmatova, Mariya},
  editor    = {Koehn, Philipp and Haddow, Barry and Kocmi, Tom and Monz, Christof},
  title     = {Findings of the 2023 Conference on Machine Translation ({WMT}23): {LLM}s Are Here but Not Quite There Yet},
  booktitle = {Proceedings of the Eighth Conference on Machine Translation},
  publisher = {Association for Computational Linguistics},
  address   = {Singapore},
  month     = dec,
  year      = {2023},
  pages     = {1--42},
  doi       = {10.18653/v1/2023.wmt-1.1},
  url       = {https://aclanthology.org/2023.wmt-1.1},
}

Post, Matt and Gowda, Thamme and Grundkiewicz, Roman and Khayrallah, Huda and Jain, Rohit and Junczys-Dowmunt, Marcin. "SOTASTREAM: A Streaming Approach to Machine Translation Training". Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023). 2023. [Link] [DOI] [BibTeX]

@inproceedings{post-etal-2023-sotastream,
  author    = {Post, Matt and Gowda, Thamme and Grundkiewicz, Roman and Khayrallah, Huda and Jain, Rohit and Junczys-Dowmunt, Marcin},
  editor    = {Tan, Liling and Milajevs, Dmitrijs and Chauhan, Geeticka and Gwinnup, Jeremy and Rippeth, Elijah},
  title     = {{SOTASTREAM}: A Streaming Approach to Machine Translation Training},
  booktitle = {Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023)},
  publisher = {Association for Computational Linguistics},
  address   = {Singapore},
  month     = dec,
  year      = {2023},
  pages     = {110--119},
  doi       = {10.18653/v1/2023.nlposs-1.13},
  url       = {https://aclanthology.org/2023.nlposs-1.13},
}

Kocmi, Tom and Bawden, Rachel and Bojar, Ondřej and Dvorkovich, Anton and Federmann, Christian and Fishel, Mark and Gowda, Thamme and Graham, Yvette and Grundkiewicz, Roman and Haddow, Barry and Knowles, Rebecca and Koehn, Philipp and Monz, Christof and Morishita, Makoto and Nagata, Masaaki and Nakazawa, Toshiaki and Novák, Michal and Popel, Martin and Popović, Maja and Shmatova, Mariya. "Findings of the 2022 Conference on Machine Translation (WMT22)". Proceedings of the Seventh Conference on Machine Translation. 2022. [Link] [BibTeX]

% Accented author names are written as LaTeX escapes so that classic 8-bit BibTeX sorts and prints them correctly.
@inproceedings{kocmi-EtAl:2022:WMT,
  author    = {Kocmi, Tom and Bawden, Rachel and Bojar, Ond{\v{r}}ej
               and Dvorkovich, Anton and Federmann, Christian and Fishel, Mark
               and Gowda, Thamme and Graham, Yvette and Grundkiewicz, Roman
               and Haddow, Barry and Knowles, Rebecca and Koehn, Philipp
               and Monz, Christof and Morishita, Makoto and Nagata, Masaaki
               and Nakazawa, Toshiaki and Nov{\'a}k, Michal and Popel, Martin
               and Popovi{\'c}, Maja and Shmatova, Mariya},
  title     = {Findings of the 2022 Conference on Machine Translation ({WMT22})},
  booktitle = {Proceedings of the Seventh Conference on Machine Translation},
  month     = dec,
  year      = {2022},
  address   = {Abu Dhabi},
  publisher = {Association for Computational Linguistics},
  pages     = {1--45},
  url       = {https://aclanthology.org/2022.wmt-1.1},
}

Thamme Gowda. "The inevitable problem of rare phenomena learning in machine translation". 2022. [BibTeX]

% PhD dissertation. The month uses the standard three-letter macro so styles can localise or abbreviate it.
@phdthesis{gowda-2022-diss,
  author  = {Gowda, Thamme},
  title   = {The inevitable problem of rare phenomena learning in machine translation},
  school  = {University of Southern California Graduate School},
  address = {Los Angeles, California, USA},
  month   = aug,
  year    = {2022},
}

Gowda, Thamme and Gheini, Mozhdeh and May, Jonathan. "Checks and Strategies for Enabling Code-Switched Machine Translation". arXiv preprint arXiv:2210.05096. 2022. [BibTeX]

% arXiv preprint. The journal field is kept so existing styles still print the venue; the eprint fields expose the arXiv id in machine-readable form.
@article{gowda2022checks,
  author        = {Gowda, Thamme and Gheini, Mozhdeh and May, Jonathan},
  title         = {Checks and Strategies for Enabling Code-Switched Machine Translation},
  journal       = {arXiv preprint arXiv:2210.05096},
  year          = {2022},
  eprint        = {2210.05096},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2210.05096},
}

Gowda, Thamme and Zhang, Zhao and Mattmann, Chris and May, Jonathan. "Many-to-English Machine Translation Tools, Data, and Pretrained Models". Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: System Demonstrations. 2021. [Link] [DOI] [BibTeX]

@inproceedings{gowda-etal-2021-many,
  author    = {Gowda, Thamme and Zhang, Zhao and Mattmann, Chris and May, Jonathan},
  title     = {Many-to-{E}nglish Machine Translation Tools, Data, and Pretrained Models},
  booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: System Demonstrations},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = aug,
  year      = {2021},
  pages     = {306--316},
  doi       = {10.18653/v1/2021.acl-demo.37},
  url       = {https://aclanthology.org/2021.acl-demo.37},
  abstract  = {While there are more than 7000 languages in the world, most translation research efforts have targeted a few high resource languages. Commercial translation systems support only one hundred languages or fewer, and do not make these models available for transfer to low resource languages. In this work, we present useful tools for machine translation research: MTData, NLCodec and RTG. We demonstrate their usefulness by creating a multilingual neural machine translation model capable of translating from 500 source languages to English. We make this multilingual model readily downloadable and usable as a service, or as a parent model for transfer-learning to even lower-resource languages.},
}

Gowda, Thamme and You, Weiqiu and Lignos, Constantine and May, Jonathan. "Macro-Average: Rare Types Are Important Too". Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. 2021. [Link] [DOI] [BibTeX]

@inproceedings{gowda-etal-2021-macro-average,
  author    = {Gowda, Thamme and You, Weiqiu and Lignos, Constantine and May, Jonathan},
  title     = {Macro-Average: Rare Types Are Important Too},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = jun,
  year      = {2021},
  pages     = {1138--1157},
  doi       = {10.18653/v1/2021.naacl-main.90},
  url       = {https://aclanthology.org/2021.naacl-main.90},
  abstract  = {While traditional corpus-level evaluation metrics for machine translation (MT) correlate well with fluency, they struggle to reflect adequacy. Model-based MT metrics trained on segment-level human judgments have emerged as an attractive replacement due to strong correlation results. These models, however, require potentially expensive re-training for new domains and languages. Furthermore, their decisions are inherently non-transparent and appear to reflect unwelcome biases. We explore the simple type-based classifier metric, MacroF1, and study its applicability to MT evaluation. We find that MacroF1 is competitive on direct assessment, and outperforms others in indicating downstream cross-lingual information retrieval task performance. Further, we show that MacroF1 can be used to effectively compare supervised and unsupervised neural machine translation, and reveal significant qualitative differences in the methods{'} outputs.},
}

Gowda, Thamme and May, Jonathan. "Finding the Optimal Vocabulary Size for Neural Machine Translation". Findings of the Association for Computational Linguistics: EMNLP 2020. 2020. [Link] [DOI] [BibTeX]

% The URL uses the aclanthology.org form, matching the other ACL Anthology entries in this file.
@inproceedings{gowda-may-2020-finding,
  author    = {Gowda, Thamme and May, Jonathan},
  title     = {Finding the Optimal Vocabulary Size for Neural Machine Translation},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2020},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = nov,
  year      = {2020},
  pages     = {3955--3964},
  doi       = {10.18653/v1/2020.findings-emnlp.352},
  url       = {https://aclanthology.org/2020.findings-emnlp.352},
  abstract  = {We cast neural machine translation (NMT) as a classification task in an autoregressive setting and analyze the limitations of both classification and autoregression components. Classifiers are known to perform better with balanced class distributions during training. Since the Zipfian nature of languages causes imbalanced classes, we explore its effect on NMT. We analyze the effect of various vocabulary sizes on NMT performance on multiple languages with many data sizes, and reveal an explanation for why certain vocabulary sizes are better than others.},
  ISIArea   = {Natural},
}

Mehrabi, Ninareh and Gowda, Thamme and Morstatter, Fred and Peng, Nanyun and Galstyan, Aram. "Man is to Person as Woman is to Location: Measuring Gender Bias in Named Entity Recognition". Proceedings of the 31st ACM Conference on Hypertext and Social Media. 2020. [Link] [DOI] [BibTeX]

% The page range uses the ASCII double hyphen; a Unicode en-dash is not safe in classic 8-bit BibTeX.
@inproceedings{mehrabi2020NERbias,
  author    = {Mehrabi, Ninareh and Gowda, Thamme and Morstatter, Fred and Peng, Nanyun and Galstyan, Aram},
  title     = {Man is to Person as Woman is to Location: Measuring Gender Bias in Named Entity Recognition},
  booktitle = {Proceedings of the 31st ACM Conference on Hypertext and Social Media},
  series    = {HT '20},
  location  = {Virtual Event, USA},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2020},
  pages     = {231--232},
  numpages  = {2},
  isbn      = {9781450370981},
  doi       = {10.1145/3372923.3404804},
  url       = {https://doi.org/10.1145/3372923.3404804},
  keywords  = {algorithmic fairness, named entity recognition, evaluation, natural language processing},
  abstract  = {In this paper, we study the bias in named entity recognition (NER) models---specifically, the difference in the ability to recognize male and female names as PERSON entity types. We evaluate NER models on a dataset containing 139 years of U.S. census baby names and find that relatively more female names, as opposed to male names, are not recognized as PERSON entities. The result of this analysis yields a new benchmark for gender bias evaluation in named entity recognition systems. The data and code for the application of this benchmark is publicly available for researchers to use.},
  ISIArea   = {Natural and Machine},
}

Pan, Xiaoman and Gowda, Thamme and Ji, Heng and May, Jonathan and Miller, Scott. "Cross-lingual Joint Entity and Word Embedding to Improve Entity Linking and Parallel Sentence Mining". Proceedings of the 2nd Workshop on Deep Learning Approaches for Low-Resource NLP (DeepLo 2019). 2019. [Link] [DOI] [BibTeX]

% The URL uses the aclanthology.org form, matching the other ACL Anthology entries in this file.
@inproceedings{pan-etal-2019-cross,
  author    = {Pan, Xiaoman and Gowda, Thamme and Ji, Heng and May, Jonathan and Miller, Scott},
  title     = {Cross-lingual Joint Entity and Word Embedding to Improve Entity Linking and Parallel Sentence Mining},
  booktitle = {Proceedings of the 2nd Workshop on Deep Learning Approaches for Low-Resource NLP (DeepLo 2019)},
  publisher = {Association for Computational Linguistics},
  address   = {Hong Kong, China},
  month     = nov,
  year      = {2019},
  pages     = {56--66},
  doi       = {10.18653/v1/D19-6107},
  url       = {https://aclanthology.org/D19-6107},
  abstract  = {Entities, which refer to distinct objects in the real world, can be viewed as language universals and used as effective signals to generate less ambiguous semantic representations and align multiple languages. We propose a novel method, CLEW, to generate cross-lingual data that is a mix of entities and contextual words based on Wikipedia. We replace each anchor link in the source language with its corresponding entity title in the target language if it exists, or in the source language otherwise. A cross-lingual joint entity and word embedding learned from this kind of data not only can disambiguate linkable entities but can also effectively represent unlinkable entities. Because this multilingual common space directly relates the semantics of contextual words in the source language to that of entities in the target language, we leverage it for unsupervised cross-lingual entity linking. Experimental results show that CLEW significantly advances the state-of-the-art: up to 3.1{\%} absolute F-score gain for unsupervised cross-lingual entity linking. Moreover, it provides reliable alignment on both the word/entity level and the sentence level, and thus we use it to mine parallel sentences for all (302, 2) language pairs in Wikipedia.},
  ISIArea   = {Natural},
}

Boschee, Elizabeth and Barry, Joel and Billa, Jayadev and Freedman, Marjorie and Gowda, Thamme and Lignos, Constantine and Palen-Michel, Chester and Pust, Michael and Khonglah, Banriskhem Kayang and Madikeri, Srikanth and May, Jonathan and Miller, Scott. "SARAL: A Low-Resource Cross-Lingual Domain-Focused Information Retrieval System for Effective Rapid Document Triage". Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics: System Demonstrations. 2019. [Link] [DOI] [BibTeX]

% The URL uses the aclanthology.org form, matching the other ACL Anthology entries in this file.
@inproceedings{boschee-etal-2019-saral,
  author    = {Boschee, Elizabeth and Barry, Joel and Billa, Jayadev
               and Freedman, Marjorie and Gowda, Thamme and Lignos, Constantine
               and Palen-Michel, Chester and Pust, Michael and Khonglah, Banriskhem Kayang
               and Madikeri, Srikanth and May, Jonathan and Miller, Scott},
  title     = {{SARAL}: A Low-Resource Cross-Lingual Domain-Focused Information Retrieval System for Effective Rapid Document Triage},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics: System Demonstrations},
  publisher = {Association for Computational Linguistics},
  address   = {Florence, Italy},
  month     = jul,
  year      = {2019},
  pages     = {19--24},
  doi       = {10.18653/v1/P19-3004},
  url       = {https://aclanthology.org/P19-3004},
  abstract  = {With the increasing democratization of electronic media, vast information resources are available in less-frequently-taught languages such as Swahili or Somali. That information, which may be crucially important and not available elsewhere, can be difficult for monolingual English speakers to effectively access. In this paper we present an end-to-end cross-lingual information retrieval (CLIR) and summarization system for low-resource languages that 1) enables English speakers to search foreign language repositories of text and audio using English queries, 2) summarizes the retrieved documents in English with respect to a particular information need, and 3) provides complete transcriptions and translations as needed. The SARAL system achieved the top end-to-end performance in the most recent IARPA MATERIAL CLIR+summarization evaluations. Our demonstration system provides end-to-end open query retrieval and summarization capability, and presents the original source text or audio, speech transcription, and machine translation, for two low resource languages.},
  ISIArea   = {Natural},
}

Wagstaff, Kiri and Lu, You and Stanboli, Alice and Grimes, Kevin and Gowda, Thamme and Padams, Jordan. "Deep Mars: CNN Classification of Mars Imagery for the PDS Imaging Atlas". Proceedings of the AAAI Conference on Artificial Intelligence. 2018. [Link] [BibTeX]

% Braced title words are proper nouns and acronyms that must survive sentence-casing; the month uses the standard macro.
@article{Wagstaff2018Deepmars,
  author  = {Wagstaff, Kiri and Lu, You and Stanboli, Alice and Grimes, Kevin and Gowda, Thamme and Padams, Jordan},
  title   = {{Deep Mars}: {CNN} Classification of {Mars} Imagery for the {PDS Imaging Atlas}},
  journal = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume  = {32},
  number  = {1},
  month   = apr,
  year    = {2018},
  url     = {https://ojs.aaai.org/index.php/AAAI/article/view/11404},
  ISIArea = {Vision and Space},
}

Hundman, Kyle and Gowda, Thamme and Kejriwal, Mayank and Boecking, Benedikt. "Always Lurking: Understanding and Mitigating Bias in Online Human Trafficking Detection". Proceedings of the 2018 AAAI/ACM Conference on AI, Ethics, and Society. 2018. [Link] [DOI] [BibTeX]

% The page range uses the ASCII double hyphen; a Unicode en-dash is not safe in classic 8-bit BibTeX.
@inproceedings{hundman2018lurking,
  author    = {Hundman, Kyle and Gowda, Thamme and Kejriwal, Mayank and Boecking, Benedikt},
  title     = {Always Lurking: Understanding and Mitigating Bias in Online Human Trafficking Detection},
  booktitle = {Proceedings of the 2018 AAAI/ACM Conference on AI, Ethics, and Society},
  series    = {AIES '18},
  location  = {New Orleans, LA, USA},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2018},
  pages     = {137--143},
  numpages  = {7},
  isbn      = {9781450360128},
  doi       = {10.1145/3278721.3278782},
  url       = {https://doi.org/10.1145/3278721.3278782},
  keywords  = {bias mitigation, clustering, web crawling, human trafficking, text classification},
  ISIArea   = {Machine and Cybersecurity and Natural},
}

Wagstaff, Kiri and Francis, Raymond and Gowda, Thamme and Lu, You and Riloff, Ellen and Singh, Karanjeet and Lanza, Nina. "Mars Target Encyclopedia: Rock and Soil Composition Extracted From the Literature". Proceedings of the AAAI Conference on Artificial Intelligence. 2018. [Link] [BibTeX]

% The braced title phrase is a proper noun that must survive sentence-casing; the month uses the standard macro.
@article{wagstaff2018MTE,
  author  = {Wagstaff, Kiri and Francis, Raymond and Gowda, Thamme and Lu, You and Riloff, Ellen and Singh, Karanjeet and Lanza, Nina},
  title   = {{Mars Target Encyclopedia}: Rock and Soil Composition Extracted From the Literature},
  journal = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume  = {32},
  number  = {1},
  month   = apr,
  year    = {2018},
  url     = {https://ojs.aaai.org/index.php/AAAI/article/view/11412},
  ISIArea = {Space and Machine and Natural},
}

Gowda, Thamme and Hundman, Kyle and Mattmann, Chris A. "An Approach for Automatic and Large Scale Image Forensics". Proceedings of the 2nd International Workshop on Multimedia Forensics and Security. 2017. [Link] [DOI] [BibTeX]

% The page range uses the ASCII double hyphen; a Unicode en-dash is not safe in classic 8-bit BibTeX.
@inproceedings{forensics2017,
  author    = {Gowda, Thamme and Hundman, Kyle and Mattmann, Chris A.},
  title     = {An Approach for Automatic and Large Scale Image Forensics},
  booktitle = {Proceedings of the 2nd International Workshop on Multimedia Forensics and Security},
  series    = {MFSec '17},
  location  = {Bucharest, Romania},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2017},
  pages     = {16--20},
  numpages  = {5},
  isbn      = {9781450350341},
  doi       = {10.1145/3078897.3080536},
  url       = {https://doi.org/10.1145/3078897.3080536},
  keywords  = {information retrieval, image recognition, multimedia forensics},
  abstract  = {This paper describes the applications of deep learning-based image recognition in the DARPA Memex program and its repository of 1.4 million weapons-related images collected from the Deep web. We develop a fast, efficient, and easily deployable framework for integrating Google's Tensorflow framework with Apache Tika for automatically performing image forensics on the Memex data. Our framework and its integration are evaluated qualitatively and quantitatively and our work suggests that automated, large-scale, and reliable image classification and forensics can be widely used and deployed in bulk analysis for answering domain-specific questions.},
  ISIArea   = {Machine and Vision},
}

Gowda, Thamme and Mattmann, Chris A. "Clustering Web Pages Based on Structure and Style Similarity (Application Paper)". 2016 IEEE 17th International Conference on Information Reuse and Integration (IRI). 2016. [DOI] [BibTeX]

% Empty volume and number fields are omitted because they only produce empty-field warnings; the page range uses a double hyphen.
@inproceedings{clustering2016,
  author    = {Gowda, Thamme and Mattmann, Chris A.},
  title     = {Clustering Web Pages Based on Structure and Style Similarity (Application Paper)},
  booktitle = {2016 {IEEE} 17th International Conference on Information Reuse and Integration ({IRI})},
  year      = {2016},
  pages     = {175--180},
  doi       = {10.1109/IRI.2016.30},
  ISIArea   = {Machine},
}