Publications

This page lists selected papers by lab members; see also the complete list of Amir's publications.

(), """.". . (), . In: . (.) : , . , .
[ link ] [ link ]
[DOI]

@inproceedings{levine-zeldes-2026-whats,
  author = {Levine, Lauren and Zeldes, Amir},
  title = {What{'}s in a Bridge?: A Descriptive, Multi-Genre Analysis of the {GUMB}ridge Corpus for Varieties of Bridging Anaphora},
  editor = {Braud, Chlo{\'e} and Hardmeier, Christian and Ogrodniczuk, Maciej and Loaiciga, Sharid and Zeldes, Amir and Nov{\'a}k, Michal and Li, Chuyuan and Strube, Michael and Li, Junyi Jessy},
  booktitle = {Proceedings of the 2nd Joint Workshop on Computational Approaches to Discourse, Context and Document-Level Inferences and Computational Models of Reference, Anaphora and Coreference ({CODI}-{CRAC} 2026)},
  month = jul,
  year = {2026},
  address = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  pages = {40--52},
  isbn = {979-8-89176-400-2},
  doi = {10.18653/v1/2026.codi-1.7},
  url = {https://aclanthology.org/2026.codi-1.7/},
  abstract = {In this paper, we present a descriptive corpus analysis of bridging anaphora across 16 genres of English, leveraging the multi-genre GUMBridge corpus for varieties of bridging anaphora. We begin our investigation by examining the distribution of bridging instances by sub-varieties and across genres, finding that spoken genres have less bridging instances than written ones. We then investigate the linguistic environments of bridging anaphora and their corresponding associative antecedents in the underlying data of the corpus, examining both categorical features (entity type, part of speech, syntactic dependency relations) and numeric features (mention length, cluster size, salience, and distance between the bridging anaphor and antecedent). We find bridging anaphora have a tendency to be shorter and are more often definite, and bridging antecedents show a tendency to be more salient than other entities. Finally, we analyze how several of the numeric features of bridging environments vary by genre, finding consistent patterns across genres for observed trends in the environments of bridging anaphora and antecedents.}
}
@inproceedings{zeldes-etal-2026-worth,
  author = {Zeldes, Amir and Conhaim, Katherine and Levine, Lauren},
  title = {Not Worth Mentioning? A Pilot Study on Salient Proposition Annotation},
  editor = {Liu, Yang Janet and Gessler, Luke},
  booktitle = {Proceedings of the 20th Linguistic Annotation Workshop ({LAW} {XX})},
  month = jul,
  year = {2026},
  address = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  pages = {178--186},
  isbn = {979-8-89176-404-0},
  doi = {10.18653/v1/2026.law-main.14},
  url = {https://aclanthology.org/2026.law-main.14/},
  abstract = {Despite a long tradition of work on extractive summarization, which by nature aims to recover the most important propositions in a text, little work has been done on operationalizing graded proposition salience in naturally occurring data. In this paper, we adopt graded summarization-based salience as a metric from previous work on Salient Entity Extraction (SEE) and adapt it to quantify proposition salience. We define the annotation task, apply it to a small multi-genre dataset, evaluate agreement and carry out a preliminary study of the relationship between our metric and notions of discourse unit centrality in discourse parsing following Rhetorical Structure Theory (RST).}
}
@inproceedings{levine-zeldes-2026-cracks,
  author = {Levine, Lauren and Zeldes, Amir},
  title = {Cracks in the Bridge{---}or A Bridge Too Far? Comparing Human and {LLM} Errors in the Annotation of Bridging Anaphora},
  editor = {Liu, Yang Janet and Gessler, Luke},
  booktitle = {Proceedings of the 20th Linguistic Annotation Workshop ({LAW} {XX})},
  month = jul,
  year = {2026},
  address = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  pages = {219--228},
  isbn = {979-8-89176-404-0},
  doi = {10.18653/v1/2026.law-main.16},
  url = {https://aclanthology.org/2026.law-main.16/},
  abstract = {In this paper, we perform an error analysis on human and LLM annotation data from the recent GUMBridge corpus for varieties of bridging anaphora. We explore the distribution of precision and recall errors made by annotators and how that distribution correlates with bridging subtypes. We find that while LLMs perform substantially worse than human annotators, they are more balanced in their precision and recall scores than humans, whose performance strongly favors precision. With regard to subtypes, we find that comparison and meronomy relations are easier to reliably annotate than the more broadly construed entity relations for both human and LLM annotators, but that LLM errors are more distributed across subtypes than human errors. Analyzing these results, we provide insights for future annotation projects on bridging anaphora.}
}
@inproceedings{levine-etal-2026-gumbridge,
  title = {{GUMBridge}: A Corpus for Varieties of Bridging Anaphora},
  author = {Levine, Lauren and Zeldes, Amir},
  booktitle = {Proceedings of the Fifteenth Language Resources and Evaluation Conference (LREC 2026)},
  month = may,
  year = {2026},
  pages = {6823--6837},
  address = {Palma, Mallorca, Spain},
  publisher = {European Language Resources Association (ELRA)},
  editor = {Piperidis, Stelios and Bel, N{\'u}ria and van den Heuvel, Henk and Ide, Nancy and Krek, Simon and Toral, Antonio},
  doi = {10.63317/3sf73k63vuww},
  abstract = {Bridging is an anaphoric phenomenon where the referent of an entity in a discourse is dependent on a previous, non-identical entity for interpretation, such as in "There is a house. The door is red," where the door is specifically understood to be the door of the aforementioned house. While there are several existing resources in English for bridging anaphora, most are small, provide limited coverage of the phenomenon, and/or provide limited genre coverage. In this paper, we introduce GUMBridge, a new resource for bridging, which includes 24 diverse genres of English, providing both broad coverage for the phenomenon, and granular annotations for the multi-subtype categorization of bridging varieties. We also present an evaluation of annotation quality and report on baseline performance using open and closed source contemporary LLMs on three tasks underlying our data, showing that bridging resolution and subtype classification remain difficult NLP tasks in the age of LLMs.}
}
@inproceedings{ju-etal-2025-dedisco,
  author = {Ju, Zhuoxuan and Wu, Jingni and Purushothama, Abhishek and Zeldes, Amir},
  title = {{D}e{D}is{C}o at the {DISRPT} 2025 Shared Task: A System for Discourse Relation Classification},
  editor = {Braud, Chlo{\'e} and Liu, Yang Janet and Muller, Philippe and Zeldes, Amir and Li, Chuyuan},
  booktitle = {Proceedings of the 4th Shared Task on Discourse Relation Parsing and Treebanking (DISRPT 2025)},
  month = nov,
  year = {2025},
  address = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  pages = {48--62},
  isbn = {979-8-89176-344-9},
  doi = {10.18653/v1/2025.disrpt-1.4},
  url = {https://aclanthology.org/2025.disrpt-1.4/},
  abstract = {This paper presents DeDisCo, Georgetown University{'}s entry in the DISRPT 2025 shared task on discourse relation classification. We test two approaches, using an mt5-based encoder and a decoder based approach using the openly available Qwen model. We also experiment on training with augmented dataset for low-resource languages using matched data translated automatically from English, as well as using some additional linguistic features inspired by entries in previous editions of the Shared Task. Our system achieves a macro-accuracy score of 71.28, and we provide some interpretation and error analysis for our results.}
}
@inproceedings{braud-etal-2025-disrpt,
  author = {Braud, Chlo{\'e} and Zeldes, Amir and Li, Chuyuan and Liu, Yang Janet and Muller, Philippe},
  title = {The {DISRPT} 2025 Shared Task on Elementary Discourse Unit Segmentation, Connective Detection, and Relation Classification},
  editor = {Braud, Chlo{\'e} and Liu, Yang Janet and Muller, Philippe and Zeldes, Amir and Li, Chuyuan},
  booktitle = {Proceedings of the 4th Shared Task on Discourse Relation Parsing and Treebanking (DISRPT 2025)},
  month = nov,
  year = {2025},
  address = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  pages = {1--20},
  isbn = {979-8-89176-344-9},
  doi = {10.18653/v1/2025.disrpt-1.1},
  url = {https://aclanthology.org/2025.disrpt-1.1/},
  abstract = {In 2025, we held the fourth iteration of the DISRPT Shared Task (Discourse Relation Parsing and Treebanking) dedicated to discourse parsing across formalisms. Following the success of the 2019, 2021, and 2023 tasks on Elementary Discourse Unit Segmentation, Connective Detection, and Relation Classification, this iteration added 13 new datasets, including three new languages (Czech, Polish, Nigerian Pidgin) and two new frameworks: the ISO framework and Enhanced Rhetorical Structure Theory, in addition to the previously included frameworks: RST, SDRT, DEP, and PDTB. In this paper, we review the data included in DISRPT 2025, which covers 39 datasets across 16 languages, survey and compare submitted systems, and report on system performance on each task for both treebanked and plain-tokenized versions of the data. The best systems obtain a mean accuracy of 71.19{\%} for relation classification, a mean F1 of 91.57 (Treebanked Track) and 87.38 (Plain Track) for segmentation, and a mean F1 of 81.53 (Treebanked Track) and 79.92 (Plain Track) for connective identification. The data and trained models of several participants can be found at https://huggingface.co/multilingual-discourse-hub.}
}
@inproceedings{wu-zeldes-2025-unpacking,
    title = "Unpacking Ambiguity: The Interaction of Polysemous Discourse Markers and Non-{DM} Signals",
    author = "Wu, Jingni  and
      Zeldes, Amir",
    editor = "Strube, Michael  and
      Braud, Chlo{\'e}  and
      Hardmeier, Christian  and
      Li, Junyi Jessy  and
      Loaiciga, Sharid  and
      Zeldes, Amir  and
      Li, Chuyuan",
    booktitle = "Proceedings of the 6th Workshop on Computational Approaches to Discourse, Context and Document-Level Inferences (CODI 2025)",
    month = nov,
    year = "2025",
    address = "Suzhou, China",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.codi-1.2/",
    doi = "10.18653/v1/2025.codi-1.2",
    pages = "14--26",
    ISBN = "979-8-89176-343-2",
    abstract = "Discourse markers (DMs) like `but' or `then' are crucial for creating coherence in discourse, yet they are often replaced by or co-occur with non-DMs ({`}in the morning' can mean the same as `then'), and both can be ambiguous ({`}since' can refer to time or cause). The interaction mechanism between such signals remains unclear but pivotal for their disambiguation. In this paper we investigate the relationship between DM polysemy and co-occurrence of non-DM signals in English, as well as the influence of genre on these patterns. Using the framework of eRST, we propose a graded definition of DM polysemy, and conduct correlation and regression analyses to examine whether polysemous DMs are accompanied by more numerous and diverse non-DM signals. Our findings reveal that while polysemous DMs do co-occur with more diverse non-DMs, the total number of co-occurring signals does not necessarily increase. Moreover, genre plays a significant role in shaping DM-signal interactions."
}
@inproceedings{zeldes-etal-2025-ud,
  author = {Zeldes, Amir and Speransky, Nina and Wagner, Nicholas E. and Schroeder, Caroline T.},
  title = {A {UD} Treebank for Bohairic {C}optic},
  editor = {Bouma, Gosse and {\c{C}}{\"o}ltekin, {\c{C}}a{\u{g}}r{\i}},
  booktitle = {Proceedings of the Eighth Workshop on Universal Dependencies (UDW, SyntaxFest 2025)},
  month = aug,
  year = {2025},
  address = {Ljubljana, Slovenia},
  publisher = {Association for Computational Linguistics},
  pages = {59--69},
  isbn = {979-8-89176-292-3},
  url = {https://aclanthology.org/2025.udw-1.7/},
  abstract = {Despite recent advances in digital resources for other Coptic dialects, especially Sahidic, Bohairic Coptic, the main Coptic dialect for pre-Mamluk, late Byzantine Egypt, and the contemporary language of the Coptic Church, remains critically under-resourced. This paper presents and evaluates the first syntactically annotated corpus of Bohairic Coptic, sampling data from a range of works, including Biblical text, saints' lives and Christian ascetic writing. We also explore some of the main differences we observe compared to the existing UD treebank of Sahidic Coptic, the classical dialect of the language, and conduct joint and cross-dialect parsing experiments, revealing the unique nature of Bohairic as a related, but distinct variety from the more often studied Sahidic.}
}
@inproceedings{levine2025building,
  author    = {Levine, Lauren and Min, Junghyun and Zeldes, Amir},
  title     = {Building {UD} {Cairo} for {Old English} in the Classroom},
  booktitle = {Proceedings of SyntaxFest 2025},
  year      = {2025},
  address   = {Ljubljana, Slovenia},
  url       = {https://arxiv.org/abs/2504.18718},
  abstract = {In this paper we present a sample treebank for Old English based on the UD Cairo sentences, collected and annotated as part of a classroom curriculum in Historical Linguistics. To collect the data, a sample of 20 sentences illustrating a range of syntactic constructions in the world's languages, we employ a combination of LLM prompting and searches in authentic Old English data. For annotation we assigned sentences to multiple students with limited prior exposure to UD, whose annotations we compare and adjudicate. Our results suggest that while current LLM outputs in Old English do not reflect authentic syntax, this can be mitigated by post-editing, and that although beginner annotators do not possess enough background to complete the task perfectly, taken together they can produce good results and learn from the experience. We also conduct preliminary parsing experiments using Modern English training data, and find that although performance on Old English is poor, parsing on annotated features (lemma, hyperlemma, gloss) leads to improved performance.}
}
@inproceedings{levine2025subjectivity,
    title = {Subjectivity in the Annotation of Bridging Anaphora},
    author = {Levine, Lauren and Zeldes, Amir},
    booktitle = {Proceedings of the 19th Linguistic Annotation Workshop (LAW-XIX)},
    address = {Vienna, Austria},
    year = {2025},
    url = {https://arxiv.org/abs/2506.07297},
    abstract = {Bridging refers to the associative relationship between inferable entities in a discourse and the antecedents which allow us to understand them, such as understanding what "the door" means with respect to an aforementioned "house". As identifying associative relations between entities is an inherently subjective task, it is difficult to achieve consistent agreement in the annotation of bridging anaphora and their antecedents. In this paper, we explore the subjectivity involved in the annotation of bridging instances at three levels: anaphor recognition, antecedent resolution, and bridging subtype selection. To do this, we conduct an annotation pilot on the test set of the existing GUM corpus, and propose a newly developed classification system for bridging subtypes, which we compare to previously proposed schemes. Our results suggest that some previous resources are likely to be severely under-annotated. We also find that while agreement on the bridging subtype category was moderate, annotator overlap for exhaustively identifying instances of bridging is low, and that many disagreements resulted from subjective understanding of the entities involved.}
}
@inproceedings{levine-2025-signaling,
  author    = {Levine, Lauren},
  booktitle = {Proceedings of the Society for Computation in Linguistics (SCiL), Vol. 8},
  title     = {A Cross-Genre Analysis of Discourse Relation Signaling in the {GUM} Corpus},
  year      = {2025},
  address   = {Eugene, OR, USA},
  url       = {https://openpublishing.library.umass.edu/scil/article/id/3155/},
  abstract = {In this paper, we investigate the cross-genre variation in how discourse relations are signaled in the Georgetown University Multilayer (GUM) Corpus, an English language corpus which contains 16 different genres of texts with various linguistic annotations, including Rhetorical Structure Theory (RST) style discourse annotations. We look at the proportions of discourse signals in each genre, and then we conduct an analysis of which discourse relations display the most inter-genre variation in how they are signaled, providing a methodology for ranking the inter-genre variability of the signaling of individual discourse relations. Although the way in which individual discourse relations are signaled in GUM is relatively stable across genres, we are able still to produce stable rankings, finding that organization, restatement, and explanation relations display the most inter-genre variation. However, we find that genre specific graphical norms can account for a large portion of the observed variation.}
}
@inproceedings{scivetti-etal-2025-multilingual,
  author = {Scivetti, Wesley and Levine, Lauren and Schneider, Nathan},
  title = {Multilingual Supervision Improves Semantic Disambiguation of Adpositions},
  booktitle = {Proceedings of the 31st International Conference on Computational Linguistics},
  month = jan,
  year = {2025},
  address = {Abu Dhabi, UAE},
  publisher = {Association for Computational Linguistics},
  pages = {3655--3669},
  url = {https://aclanthology.org/2025.coling-main.247/},
  abstract = {Adpositions display a remarkable amount of ambiguity and flexibility in their meanings, and are used in different ways across languages. We conduct a systematic corpus-based cross-linguistic investigation into the lexical semantics of adpositions, utilizing SNACS (Schneider et al., 2018), an annotation framework with data available in several languages. Our investigation encompasses 5 of these languages: Chinese, English, Gujarati, Hindi, and Japanese. We find substantial distributional differences in adposition semantics, even in comparable corpora. We further train classifiers to disambiguate adpositions in each of our languages. Despite the cross-linguistic differences in adpositional usage, sharing annotated data across languages boosts overall disambiguation performance, leading to the highest published scores on this task for all 5 languages.}
}
@article{zeldes-etal-2025-erst,
  author = {Zeldes, Amir and Aoyama, Tatsuya and Liu, Yang Janet and Peng, Siyao and Das, Debopam and Gessler, Luke},
  title = {e{RST}: A Signaled Graph Theory of Discourse Relations and Organization},
  journal = {Computational Linguistics},
  volume = {51},
  number = {1},
  month = mar,
  year = {2025},
  address = {Cambridge, MA},
  publisher = {MIT Press},
  pages = {23--72},
  doi = {10.1162/coli_a_00538},
  url = {https://aclanthology.org/2025.cl-1.3/},
  abstract = {In this article we present Enhanced Rhetorical Structure Theory (eRST), a new theoretical framework for computational discourse analysis, based on an expansion of Rhetorical Structure Theory (RST). The framework encompasses discourse relation graphs with tree-breaking, non-projective and concurrent relations, as well as implicit and explicit signals which give explainable rationales to our analyses. We survey shortcomings of RST and other existing frameworks, such as Segmented Discourse Representation Theory, the Penn Discourse Treebank, and Discourse Dependencies, and address these using constructs in the proposed theory. We provide annotation, search, and visualization tools for data, and present and evaluate a freely available corpus of English annotated according to our framework, encompassing 12 spoken and written genres with over 200K tokens. Finally, we discuss automatic parsing, evaluation metrics, and applications for data in our framework.}
}
@inproceedings{liu-etal-2024-gdtb,
  author = {Liu, Yang Janet* and Aoyama, Tatsuya* and Scivetti, Wesley* and Zhu, Yilun* and Behzad, Shabnam and Levine, Lauren and Lin, Jessica and Tiwari, Devika and Zeldes, Amir},
  title = {{GDTB}: Genre Diverse Data for {E}nglish Shallow Discourse Parsing across Modalities, Text Types, and Domains},
  editor = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  month = nov,
  year = {2024},
  address = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  pages = {12287--12303},
  note = {(*equal contribution)},
  url = {https://aclanthology.org/2024.emnlp-main.684},
  abstract = {Work on shallow discourse parsing in English has focused on the Wall Street Journal corpus, the only large-scale dataset for the language in the PDTB framework. However, the data is not openly available, is restricted to the news domain, and is by now 35 years old. In this paper, we present and evaluate a new open-access, multi-genre benchmark for PDTB-style shallow discourse parsing, based on the existing UD English GUM corpus, for which discourse relation annotations in other frameworks already exist. In a series of experiments on cross-domain relation classification, we show that while our dataset is compatible with PDTB, substantial out-of-domain degradation is observed, which can be alleviated by joint training on both datasets.},
}

@inproceedings{behzad-etal-2024-ask,
  author = {Behzad, Shabnam and Zeldes, Amir and Schneider, Nathan},
  title = {To Ask {LLM}s about {E}nglish Grammaticality, Prompt Them in a Different Language},
  editor = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2024},
  month = nov,
  year = {2024},
  address = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  pages = {15622--15634},
  url = {https://aclanthology.org/2024.findings-emnlp.916},
  abstract = {In addition to asking questions about facts in the world, some internet users{---}in particular, second language learners{---}ask questions about language itself. Depending on their proficiency level and audience, they may pose these questions in an L1 (first language) or an L2 (second language). We investigate how multilingual LLMs perform at crosslingual metalinguistic question answering. Focusing on binary questions about sentence grammaticality constructed from error-annotated learner corpora, we prompt three LLMs (Aya, Llama, and GPT) in multiple languages, including English, German, Korean, Russian, and Ukrainian. Our study reveals that the language of the prompt can significantly affect model performance, and despite English being the dominant training language for all three models, prompting in a different language with questions about English often yields better results.},
}

@inproceedings{levine-zeldes-2024-unifying,
    title = "Unifying the Scope of Bridging Anaphora Types in {E}nglish: Bridging Annotations in {ARRAU} and {GUM}",
    author = "Levine, Lauren  and
      Zeldes, Amir",
    editor = "Ogrodniczuk, Maciej  and
      Nedoluzhko, Anna  and
      Poesio, Massimo  and
      Pradhan, Sameer  and
      Ng, Vincent",
    booktitle = "Proceedings of The Seventh Workshop on Computational Models of Reference, Anaphora and Coreference",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.crac-1.5",
    pages = "41--51",
}

@inproceedings{levine-etal-2024-lacuna,
  author = {Levine, Lauren and Li, Cindy and BremerMcCollum, Lydia and Wagner, Nicholas and Zeldes, Amir},
  title = {Lacuna Language Learning: Leveraging {RNN}s for Ranked Text Completion in Digitized {C}optic Manuscripts},
  booktitle = {Proceedings of the 1st Workshop on Machine Learning for Ancient Languages (ML4AL 2024)},
  month = aug,
  year = {2024},
  address = {Hybrid in Bangkok, Thailand and online},
  publisher = {Association for Computational Linguistics},
  pages = {61--70},
  doi = {10.18653/v1/2024.ml4al-1.8},
  url = {https://aclanthology.org/2024.ml4al-1.8},
  abstract = {Ancient manuscripts are frequently damaged, containing gaps in the text known as lacunae. In this paper, we present a bidirectional RNN model for character prediction of Coptic characters in manuscript lacunae. Our best model performs with 72{\%} accuracy on single character reconstruction, but falls to 37{\%} when reconstructing lacunae of various lengths. While not suitable for definitive manuscript reconstruction, we argue that our RNN model can help scholars rank the likelihood of textual reconstructions. As evidence, we use our RNN model to rank reconstructions in two early Coptic manuscripts. Our investigation shows that neural models can augment traditional methods of textual restoration, providing scholars with an additional tool to assess lacunae in Coptic manuscripts.},
}
@inproceedings{zhu-etal-2024-splice,
  author = {Zhu, Yilun and Peng, Siyao and Pradhan, Sameer and Zeldes, Amir},
  title = {{SPLICE}: A Singleton-Enhanced {P}ipe{LI}ne for Coreference {RE}solution},
  editor = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  month = may,
  year = {2024},
  address = {Torino, Italia},
  publisher = {ELRA and ICCL},
  pages = {15191--15201},
  url = {https://aclanthology.org/2024.lrec-main.1321},
  abstract = {Singleton mentions, i.e. entities mentioned only once in a text, are important to how humans understand discourse from a theoretical perspective. However previous attempts to incorporate their detection in end-to-end neural coreference resolution for English have been hampered by the lack of singleton mention spans in the OntoNotes benchmark. This paper addresses this limitation by combining predicted mentions from existing nested NER systems and features derived from OntoNotes syntax trees. With this approach, we create a near approximation of the OntoNotes dataset with all singleton mentions, achieving {\textasciitilde}94{\%} recall on a sample of gold singletons. We then propose a two-step neural mention and coreference resolution system, named SPLICE, and compare its performance to the end-to-end approach in two scenarios: the OntoNotes test set and the out-of-domain (OOD) OntoGUM corpus. Results indicate that reconstructed singleton training yields results comparable to end-to-end systems for OntoNotes, while improving OOD stability (+1.1 avg. F1). We conduct error analysis for mention detection and delve into its impact on coreference clustering, revealing that precision improvements deliver more substantial benefits than increases in recall for resolving coreference chains.},
}
@inproceedings{braud-etal-2024-disrpt,
  author = {Braud, Chlo{\'e} and Zeldes, Amir and Rivi{\`e}re, Laura and Liu, Yang Janet and Muller, Philippe and Sileo, Damien and Aoyama, Tatsuya},
  title = {{DISRPT}: A Multilingual, Multi-domain, Cross-framework Benchmark for Discourse Processing},
  editor = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  month = may,
  year = {2024},
  address = {Torino, Italia},
  publisher = {ELRA and ICCL},
  pages = {4990--5005},
  url = {https://aclanthology.org/2024.lrec-main.447},
  abstract = {This paper presents DISRPT, a multilingual, multi-domain, and cross-framework benchmark dataset for discourse processing, covering the tasks of discourse unit segmentation, connective identification, and relation classification. DISRPT includes 13 languages, with data from 24 corpora covering about 4 millions tokens and around 250,000 discourse relation instances from 4 discourse frameworks: RST, SDRT, PDTB, and Discourse Dependencies. We present an overview of the data, its development across three NLP shared tasks on discourse processing carried out in the past five years, and the latest modifications and added extensions. We also carry out an evaluation of state-of-the-art multilingual systems trained on the data for each task, showing plateau performance on segmentation, but important room for improvement for connective identification and relation classification. The DISRPT benchmark employs a unified format that we make available on GitHub and HuggingFace in order to encourage future work on discourse processing across languages, domains, and frameworks.},
}
@inproceedings{weissweiler-etal-2024-ucxn,
  author = {Weissweiler, Leonie and B{\"o}bel, Nina and Guiller, Kirian and Herrera, Santiago and Scivetti, Wesley and Lorenzi, Arthur and Melnik, Nurit and Bhatia, Archna and Sch{\"u}tze, Hinrich and Levin, Lori and Zeldes, Amir and Nivre, Joakim and Croft, William and Schneider, Nathan},
  title = {{UC}xn: Typologically Informed Annotation of Constructions Atop {U}niversal {D}ependencies},
  editor = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  month = may,
  year = {2024},
  address = {Torino, Italia},
  publisher = {ELRA and ICCL},
  pages = {16919--16932},
  url = {https://aclanthology.org/2024.lrec-main.1471},
  abstract = {The Universal Dependencies (UD) project has created an invaluable collection of treebanks with contributions in over 140 languages. However, the UD annotations do not tell the full story. Grammatical constructions that convey meaning through a particular combination of several morphosyntactic elements{---}for example, interrogative sentences with special markers and/or word orders{---}are not labeled holistically. We argue for (i) augmenting UD annotations with a {`}UCxn{'} annotation layer for such meaning-bearing grammatical constructions, and (ii) approaching this in a typologically informed way so that morphosyntactic strategies can be compared across languages. As a case study, we consider five construction families in ten languages, identifying instances of each construction in UD treebanks through the use of morphosyntactic patterns. In addition to findings regarding these particular constructions, our study yields important insights on methodology for describing and identifying constructions in language-general and language-particular ways, and lays the foundation for future constructional enrichment of UD treebanks.},
}
@inproceedings{poesio-etal-2024-universal,
  author    = {Poesio, Massimo and
               Ogrodniczuk, Maciej and
               Ng, Vincent and
               Pradhan, Sameer and
               Yu, Juntao and
               Moosavi, Nafise Sadat and
               Paun, Silviu and
               Zeldes, Amir and
               Nedoluzhko, Anna and
               Nov{\'a}k, Michal and
               Popel, Martin and
               {\v{Z}}abokrtsk{\'y}, Zden{\v{e}}k and
               Zeman, Daniel},
  title     = {Universal Anaphora: The First Three Years},
  editor    = {Calzolari, Nicoletta and
               Kan, Min-Yen and
               Hoste, Veronique and
               Lenci, Alessandro and
               Sakti, Sakriani and
               Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  publisher = {ELRA and ICCL},
  address   = {Torino, Italia},
  month     = may,
  year      = {2024},
  pages     = {17087--17100},
  url       = {https://aclanthology.org/2024.lrec-main.1484},
  abstract  = {The aim of the Universal Anaphora initiative is to push forward the state of the art in anaphora and anaphora resolution by expanding the aspects of anaphoric interpretation which are or can be reliably annotated in anaphoric corpora, producing unified standards to annotate and encode these annotations, delivering datasets encoded according to these standards, and developing methods for evaluating models that carry out this type of interpretation. Although several papers on aspects of the initiative have appeared, no overall description of the initiative{'}s goals, proposals and achievements has been published yet except as an online draft. This paper aims to fill this gap, as well as to discuss its progress so far.},
}
@inproceedings{lin-zeldes-2024-gumsley,
  author    = {Lin, Jessica and
               Zeldes, Amir},
  title     = {{GUM}sley: Evaluating Entity Salience in Summarization for 12 {E}nglish Genres},
  editor    = {Graham, Yvette and
               Purver, Matthew},
  booktitle = {Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {St. Julian{'}s, Malta},
  month     = mar,
  year      = {2024},
  pages     = {2575--2588},
  url       = {https://aclanthology.org/2024.eacl-long.158},
  abstract  = {As NLP models become increasingly capable of understanding documents in terms of coherent entities rather than strings, obtaining the most salient entities for each document is not only an important end task in itself but also vital for Information Retrieval (IR) and other downstream applications such as controllable summarization. In this paper, we present and evaluate GUMsley, the first entity salience dataset covering all named and non-named salient entities for 12 genres of English text, aligned with entity types, Wikification links and full coreference resolution annotations. We promote a strict definition of salience using human summaries and demonstrate high inter-annotator agreement for salience based on whether a source entity is mentioned in the summary. Our evaluation shows poor performance by pre-trained SOTA summarization models and zero-shot LLM prompting in capturing salient entities in generated summaries. We also show that predicting or providing salient entities to several model architectures enhances performance and helps derive higher-quality summaries by alleviating the entity hallucination problem in existing abstractive summarization.},
}
@inproceedings{zhu-etal-2023-incorporating,
  author    = {Zhu, Yilun and
               Peng, Siyao and
               Pradhan, Sameer and
               Zeldes, Amir},
  title     = {Incorporating Singletons and Mention-based Features in Coreference Resolution via Multi-task Learning for Better Generalization},
  editor    = {Park, Jong C. and
               Arase, Yuki and
               Hu, Baotian and
               Lu, Wei and
               Wijaya, Derry and
               Purwarianti, Ayu and
               Krisnadhi, Adila Alfa},
  booktitle = {Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 2: Short Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {Nusa Dua, Bali},
  month     = nov,
  year      = {2023},
  pages     = {121--130},
  url       = {https://aclanthology.org/2023.ijcnlp-short.14},
}
@inproceedings{liu-etal-2023-whats,
  author    = {Liu, Yang Janet and
               Aoyama, Tatsuya and
               Zeldes, Amir},
  title     = {What{'}s Hard in {E}nglish {RST} Parsing? Predictive Models for Error Analysis},
  editor    = {Stoyanchev, Svetlana and
               Joty, Shafiq and
               Schlangen, David and
               Dusek, Ondrej and
               Kennington, Casey and
               Alikhani, Malihe},
  booktitle = {Proceedings of the 24th Meeting of the Special Interest Group on Discourse and Dialogue},
  publisher = {Association for Computational Linguistics},
  address   = {Prague, Czechia},
  month     = sep,
  year      = {2023},
  pages     = {31--42},
  url       = {https://aclanthology.org/2023.sigdial-1.3},
  abstract  = {Despite recent advances in Natural Language Processing (NLP), hierarchical discourse parsing in the framework of Rhetorical Structure Theory remains challenging, and our understanding of the reasons for this are as yet limited. In this paper, we examine and model some of the factors associated with parsing difficulties in previous work: the existence of implicit discourse relations, challenges in identifying long-distance relations, out-of-vocabulary items, and more. In order to assess the relative importance of these variables, we also release two annotated English test-sets with explicit correct and distracting discourse markers associated with gold standard RST relations. Our results show that as in shallow discourse parsing, the explicit/implicit distinction plays a role, but that long-distance dependencies are the main challenge, while lack of lexical overlap is less of a problem, at least for in-domain parsing. Our final model is able to predict where errors will occur with an accuracy of 76.3{\%} for the bottom-up parser and 76.6{\%} for the top-down parser.},
}
@inproceedings{behzad-etal-2023-sentence,
  author    = {Behzad, Shabnam and
               Zeldes, Amir and
               Schneider, Nathan},
  title     = {Sentence-level Feedback Generation for {E}nglish Language Learners: Does Data Augmentation Help?},
  editor    = {Mille, Simon},
  booktitle = {Proceedings of the 16th International Natural Language Generation Conference: Generation Challenges},
  publisher = {Association for Computational Linguistics},
  address   = {Prague, Czechia},
  month     = sep,
  year      = {2023},
  pages     = {53--59},
  url       = {https://aclanthology.org/2023.inlg-genchal.8},
  abstract  = {In this paper, we present strong baselines for the task of Feedback Comment Generation for Writing Learning. Given a sentence and an error span, the task is to generate a feedback comment explaining the error. Sentences and feedback comments are both in English. We experiment with LLMs and also create multiple pseudo datasets for the task, investigating how it affects the performance of our system. We present our results for the task along with extensive analysis of the generated comments with the aim of aiding future studies in feedback comment generation for English language learners.},
}
@inproceedings{braud-etal-2023-disrpt,
    title = "The {DISRPT} 2023 Shared Task on Elementary Discourse Unit Segmentation, Connective Detection, and Relation Classification",
    author = "Braud, Chlo{\'e}  and
      Liu, Yang Janet  and
      Metheniti, Eleni  and
      Muller, Philippe  and
      Rivi{\`e}re, Laura  and
      Rutherford, Attapol  and
      Zeldes, Amir",
    booktitle = "Proceedings of the 3rd Shared Task on Discourse Relation Parsing and Treebanking (DISRPT 2023)",
    month = jul,
    year = "2023",
    address = "Toronto, Canada",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.disrpt-1.1",
    doi = "10.18653/v1/2023.disrpt-1.1",
    pages = "1--21",
    abstract = "In 2023, the third iteration of the DISRPT Shared Task (Discourse Relation Parsing and Treebanking) was held, dedicated to the underlying units used in discourse parsing across formalisms. Following the success of the 2019 and 2021 tasks on Elementary Discourse Unit Segmentation, Connective Detection, and Relation Classification, this iteration has added 10 new corpora, including 2 new languages (Thai and Italian) and 3 discourse treebanks annotated in the discourse dependency representation in addition to the previously included frameworks: RST, SDRT, and PDTB. In this paper, we review the data included in the Shared Task, which covers 26 datasets across 13 languages, survey and compare submitted systems, and report on system performance on each task for both annotated and plain-tokenized versions of the data.",
}
@inproceedings{aoyama-etal-2023-gentle,
  author    = {Aoyama, Tatsuya and
               Behzad, Shabnam and
               Gessler, Luke and
               Levine, Lauren and
               Lin, Jessica and
               Liu, Yang Janet and
               Peng, Siyao and
               Zhu, Yilun and
               Zeldes, Amir},
  title     = {{GENTLE}: A Genre-Diverse Multilayer Challenge Set for {E}nglish {NLP} and Linguistic Evaluation},
  booktitle = {Proceedings of the 17th Linguistic Annotation Workshop (LAW-XVII)},
  publisher = {Association for Computational Linguistics},
  address   = {Toronto, Canada},
  month     = jul,
  year      = {2023},
  pages     = {166--178},
  url       = {https://aclanthology.org/2023.law-1.17},
  doi       = {10.18653/v1/2023.law-1.17},
  abstract  = {We present GENTLE, a new mixed-genre English challenge corpus totaling 17K tokens and consisting of 8 unusual text types for out-of-domain evaluation: dictionary entries, esports commentaries, legal documents, medical notes, poetry, mathematical proofs, syllabuses, and threat letters. GENTLE is manually annotated for a variety of popular NLP tasks, including syntactic dependency parsing, entity recognition, coreference resolution, and discourse parsing. We evaluate state-of-the-art NLP systems on GENTLE and find severe degradation for at least some genres in their performance on all tasks, which indicates GENTLE{'}s utility as an evaluation dataset for NLP systems.},
}
@inproceedings{behzad-etal-2023-elqa,
  author    = {Behzad, Shabnam and
               Sakaguchi, Keisuke and
               Schneider, Nathan and
               Zeldes, Amir},
  title     = {{ELQA}: A Corpus of Metalinguistic Questions and Answers about {E}nglish},
  editor    = {Rogers, Anna and
               Boyd-Graber, Jordan and
               Okazaki, Naoaki},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {Toronto, Canada},
  month     = jul,
  year      = {2023},
  pages     = {2031--2047},
  url       = {https://aclanthology.org/2023.acl-long.113},
  doi       = {10.18653/v1/2023.acl-long.113},
  abstract  = {We present ELQA, a corpus of questions and answers in and about the English language. Collected from two online forums, the {\textgreater}70k questions (from English learners and others) cover wide-ranging topics including grammar, meaning, fluency, and etymology. The answers include descriptions of general properties of English vocabulary and grammar as well as explanations about specific (correct and incorrect) usage examples. Unlike most NLP datasets, this corpus is metalinguistic{---}it consists of language about language. As such, it can facilitate investigations of the metalinguistic capabilities of NLU models, as well as educational applications in the language learning domain. To study this, we define a free-form question answering task on our dataset and conduct evaluations on multiple LLMs (Large Language Models) to analyze their capacity to generate metalinguistic answers.},
}
@inproceedings{liu-zeldes-2023-gumsum,
  author    = {Liu, Yang Janet and
               Zeldes, Amir},
  title     = {{GUMS}um: Multi-Genre Data and Evaluation for {E}nglish Abstractive Summarization},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2023},
  publisher = {Association for Computational Linguistics},
  address   = {Toronto, Canada},
  month     = jul,
  year      = {2023},
  pages     = {9315--9327},
  url       = {https://aclanthology.org/2023.findings-acl.593},
  doi       = {10.18653/v1/2023.findings-acl.593},
  abstract  = {Automatic summarization with pre-trained language models has led to impressively fluent results, but is prone to {`}hallucinations{'}, low performance on non-news genres, and outputs which are not exactly summaries. Targeting ACL 2023{'}s {`}Reality Check{'} theme, we present GUMSum, a small but carefully crafted dataset of English summaries in 12 written and spoken genres for evaluation of abstractive summarization. Summaries are highly constrained, focusing on substitutive potential, factuality, and faithfulness. We present guidelines and evaluate human agreement as well as subjective judgments on recent system outputs, comparing general-domain untuned approaches, a fine-tuned one, and a prompt-based approach, to human performance. Results show that while GPT3 achieves impressive scores, it still underperforms humans, with varying quality across genres. Human judgments reveal different types of errors in supervised, prompted, and human-generated summaries, shedding light on the challenges of producing a good summary.},
}
@inproceedings{liu-zeldes-2023-cant,
  author    = {Liu, Yang Janet and
               Zeldes, Amir},
  title     = {Why Can{'}t Discourse Parsing Generalize? A Thorough Investigation of the Impact of Data Diversity},
  booktitle = {Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  address   = {Dubrovnik, Croatia},
  month     = may,
  year      = {2023},
  pages     = {3112--3130},
  url       = {https://aclanthology.org/2023.eacl-main.227},
  abstract  = {Recent advances in discourse parsing performance create the impression that, as in other NLP tasks, performance for high-resource languages such as English is finally becoming reliable. In this paper we demonstrate that this is not the case, and thoroughly investigate the impact of data diversity on RST parsing stability. We show that state-of-the-art architectures trained on the standard English newswire benchmark do not generalize well, even within the news domain. Using the two largest RST corpora of English with text from multiple genres, we quantify the impact of genre diversity in training data for achieving generalization to text types unseen during training. Our results show that a heterogeneous training regime is critical for stable and generalizable models, across parser architectures. We also provide error analyses of model outputs and out-of-domain performance. To our knowledge, this study is the first to fully evaluate cross-corpus RST parsing generalizability on complete trees, examine between-genre degradation within an RST corpus, and investigate the impact of genre diversity in training data composition.},
}
@inproceedings{lin-2022-leveraging,
  author    = {Lin, Jessica},
  title     = {Leveraging World Knowledge in Implicit Hate Speech Detection},
  editor    = {Biester, Laura and
               Demszky, Dorottya and
               Jin, Zhijing and
               Sachan, Mrinmaya and
               Tetreault, Joel and
               Wilson, Steven and
               Xiao, Lu and
               Zhao, Jieyu},
  booktitle = {Proceedings of the Second Workshop on NLP for Positive Impact (NLP4PI)},
  publisher = {Association for Computational Linguistics},
  address   = {Abu Dhabi, United Arab Emirates (Hybrid)},
  month     = dec,
  year      = {2022},
  pages     = {31--39},
  url       = {https://aclanthology.org/2022.nlp4pi-1.4},
  doi       = {10.18653/v1/2022.nlp4pi-1.4},
  abstract  = {While much attention has been paid to identifying explicit hate speech, implicit hateful expressions that are disguised in coded or indirect language are pervasive and remain a major challenge for existing hate speech detection systems. This paper presents the first attempt to apply Entity Linking (EL) techniques to both explicit and implicit hate speech detection, where we show that such real world knowledge about entity mentions in a text does help models better detect hate speech, and the benefit of adding it into the model is more pronounced when explicit entity triggers (e.g., rally, KKK) are present. We also discuss cases where real world knowledge does not add value to hate speech detection, which provides more insights into understanding and modeling the subtleties of hate speech.},
}
@inproceedings{zeldes-etal-2022-second,
  author    = {Zeldes, Amir and
               Howell, Nick and
               Ordan, Noam and
               Ben Moshe, Yifat},
  title     = {A Second Wave of {UD} {H}ebrew Treebanking and Cross-Domain Parsing},
  booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Abu Dhabi, United Arab Emirates},
  month     = dec,
  year      = {2022},
  pages     = {4331--4344},
  url       = {https://aclanthology.org/2022.emnlp-main.292},
  abstract  = {Foundational Hebrew NLP tasks such as segmentation, tagging and parsing, have relied to date on various versions of the Hebrew Treebank (HTB, Sima{'}an et al. 2001). However, the data in HTB, a single-source newswire corpus, is now over 30 years old, and does not cover many aspects of contemporary Hebrew on the web. This paper presents a new, freely available UD treebank of Hebrew stratified from a range of topics selected from Hebrew Wikipedia. In addition to introducing the corpus and evaluating the quality of its annotations, we deploy automatic validation tools based on grew (Guillaume, 2021), and conduct the first cross domain parsing experiments in Hebrew. We obtain new state-of-the-art (SOTA) results on UD NLP tasks, using a combination of the latest language modelling and some incremental improvements to existing transformer based approaches. We also release a new version of the UD HTB matching annotation scheme updates from our new corpus.},
}
@inproceedings{peng-etal-2022-gcdt,
  author    = {Peng, Siyao and
               Liu, Yang Janet and
               Zeldes, Amir},
  title     = {{GCDT}: A {C}hinese {RST} Treebank for Multigenre and Multilingual Discourse Parsing},
  booktitle = {Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {Online only},
  month     = nov,
  year      = {2022},
  pages     = {382--391},
  url       = {https://aclanthology.org/2022.aacl-short.47},
  abstract  = {A lack of large-scale human-annotated data has hampered the hierarchical discourse parsing of Chinese. In this paper, we present GCDT, the largest hierarchical discourse treebank for Mandarin Chinese in the framework of Rhetorical Structure Theory (RST). GCDT covers over 60K tokens across five genres of freely available text, using the same relation inventory as contemporary RST treebanks for English. We also report on this dataset{'}s parsing experiments, including state-of-the-art (SOTA) scores for Chinese RST parsing and RST parsing on the English GUM dataset, using cross-lingual training in Chinese and English with multilingual embeddings.},
}
@inproceedings{zabokrtsky-etal-2022-findings,
    title = "Findings of the Shared Task on Multilingual Coreference Resolution",
    author = "{\v{Z}}abokrtsk{\'y}, Zden{\v{e}}k  and
      Konop{\'i}k, Miloslav  and
      Nedoluzhko, Anna  and
      Nov{\'a}k, Michal  and
      Ogrodniczuk, Maciej  and
      Popel, Martin  and
      Pra{\v{z}}{\'a}k, Ond{\v{r}}ej  and
      Sido, Jakub  and
      Zeman, Daniel  and
      Zhu, Yilun",
    booktitle = "Proceedings of the CRAC 2022 Shared Task on Multilingual Coreference Resolution",
    month = oct,
    year = "2022",
    address = "Gyeongju, Republic of Korea",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.crac-mcr.1",
    pages = "1--17",
    abstract = "This paper presents an overview of the shared task on multilingual coreference resolution associated with the CRAC 2022 workshop. Shared task participants were supposed to develop trainable systems capable of identifying mentions and clustering them according to identity coreference. The public edition of CorefUD 1.0, which contains 13 datasets for 10 languages, was used as the source of training and evaluation data. The CoNLL score used in previous coreference-oriented shared tasks was used as the main evaluation metric. There were 8 coreference prediction systems submitted by 5 participating teams; in addition, there was a competitive Transformer-based baseline system provided by the organizers at the beginning of the shared task. The winner system outperformed the baseline by 12 percentage points (in terms of the CoNLL scores averaged across all datasets for individual languages).",
}
@inproceedings{levine-2022-sharing,
  author    = {Levine, Lauren},
  title     = {Sharing Data by Language Family: Data Augmentation for {R}omance Language Morpheme Segmentation},
  booktitle = {Proceedings of the 19th SIGMORPHON Workshop on Computational Research in Phonetics, Phonology, and Morphology},
  publisher = {Association for Computational Linguistics},
  address   = {Seattle, Washington},
  month     = jul,
  year      = {2022},
  pages     = {117--123},
  url       = {https://aclanthology.org/2022.sigmorphon-1.12},
  abstract  = {This paper presents a basic character level sequence-to-sequence approach to morpheme segmentation for the following Romance languages: French, Italian, and Spanish. We experiment with adding a small set of additional linguistic features, as well as with sharing training data between sister languages for morphological categories with low performance in single language base models. We find that while the additional linguistic features were generally not helpful in this instance, data augmentation between sister languages did help to raise the scores of some individual morphological categories, but did not consistently result in an overall improvement when considering the aggregate of the categories.},
}
@inproceedings{nedoluzhko-et-al-2022-corefud,
	address = {Marseille, France},
	title = {{CorefUD} 1.0: Coreference Meets {U}niversal {D}ependencies},
	url = {https://aclanthology.org/2022.lrec-1.520/},
	abstract = {Recent advances in standardization for annotated language resources have led to successful large scale efforts, such as the Universal Dependencies (UD) project for multilingual syntactically annotated data. By comparison, the important task of coreference resolution, which clusters multiple mentions of entities in a text, has yet to be standardized in terms of data formats or annotation guidelines. In this paper we present CorefUD, a multilingual collection of corpora and a standardized format for coreference resolution, compatible with morphosyntactic annotations in the UD framework and including facilities for related tasks such as named entity recognition, which forms a first step in the direction of convergence for coreference resolution across languages.},
	booktitle = {Proceedings of {LREC2022}},
	publisher = {European Language Resources Association},
	author = {Nedoluzhko, Anna and Nov{\'a}k, Michal and Popel, Martin and {\v{Z}}abokrtsk{\'y}, Zden{\v{e}}k and Zeldes, Amir and Zeman, Daniel},
	month = jun,
	year = {2022},
	pages = {4859--4872},
}
@inproceedings{gessler_midas_2022,
	address = {Marseille, France},
	title = {Midas {Loop}: {A} {Prioritized} {Human}-in-the-{Loop} {Annotation} for {Large} {Scale} {Multilayer} {Data}},
	url = {https://aclanthology.org/2022.lawxvi-1.13},
	abstract = {Large scale annotation of rich multilayer corpus data is expensive and time consuming, motivating approaches that integrate high quality automatic tools with active learning in order to prioritize human labeling of hard cases. A related challenge in such scenarios is the concurrent management of automatically annotated data and human annotated data, particularly where different subsets of the data have been corrected for different types of annotation and with different levels of confidence. In this paper we present Midas Loop, a collaborative, version-controlled online annotation environment for multilayer corpus data which includes integrated provenance and confidence metadata for each piece of information at the document, sentence, token and annotation level. We present a case study on improving annotation quality in an existing multilayer parse bank of English called AMALGUM, focusing on active learning in corpus preprocessing, at the surprisingly challenging level of sentence segmentation. Our results show improvements to state-of-the-art sentence segmentation and a promising workflow for getting "silver" data to approach gold standard quality.},
	booktitle = {Proceedings of {The} 16th {Linguistic} {Annotation} {Workshop} ({LAW}-{XVI}) within {LREC2022}},
	publisher = {European Language Resources Association},
	author = {Gessler, Luke and Levine, Lauren and Zeldes, Amir},
	month = jun,
	year = {2022},
	pages = {103--110},
}
@inproceedings{gessler-zeldes-2022-microbert,
    title = "{M}icro{BERT}: Effective Training of Low-resource Monolingual {BERT}s through Parameter Reduction and Multitask Learning",
    author = "Gessler, Luke  and
      Zeldes, Amir",
    editor = "Ataman, Duygu  and
      Gonen, Hila  and
      Ruder, Sebastian  and
      Firat, Orhan  and
      G{\"u}l {\c{S}}ahin, G{\"o}zde  and
      Mirzakhalov, Jamshidbek",
    booktitle = "Proceedings of the 2nd Workshop on Multi-lingual Representation Learning (MRL)",
    month = dec,
    year = "2022",
    address = "Abu Dhabi, United Arab Emirates (Hybrid)",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.mrl-1.9",
    doi = "10.18653/v1/2022.mrl-1.9",
    pages = "86--99",
    abstract = "BERT-style contextualized word embedding models are critical for good performance in most NLP tasks, but they are data-hungry and therefore difficult to train for low-resource languages. In this work, we investigate whether a combination of greatly reduced model size and two linguistically rich auxiliary pretraining tasks (part-of-speech tagging and dependency parsing) can help produce better BERTs in a low-resource setting. Results from 7 diverse languages indicate that our model, MicroBERT, is able to produce marked improvements in downstream task evaluations, including gains up to 18{\%} for parser LAS and 11{\%} for NER F1 compared to an mBERT baseline, and we achieve these results with less than 1{\%} of the parameter count of a multilingual BERT base{--}sized model. We conclude that training very small BERTs and leveraging any available labeled data for multitask learning during pretraining can produce models which outperform both their multilingual counterparts and traditional fixed embeddings for low-resource languages.",
}
@inproceedings{lin-zeldes-2021-wikigum,
  author    = {Jessica Lin and Amir Zeldes},
  title     = {{W}iki{GUM}: Exhaustive Entity Linking for Wikification in 12 Genres},
  booktitle = {Proceedings of The Joint 15th Linguistic Annotation Workshop (LAW) and 3rd Designing Meaning Representations (DMR) Workshop (LAW-DMR 2021)},
  address   = {Punta Cana, Dominican Republic},
  month     = nov,
  year      = {2021},
  pages     = {170--175},
  url       = {https://aclanthology.org/2021.law-1.18},
  comment   = {paper},
}
@InProceedings{ZeldesEtAl2021,
  author    = {Amir Zeldes and Yang Janet Liu and Mikel Iruskieta and Philippe Muller and Chlo{\'e} Braud and Sonia Badene},
  booktitle = {Proceedings of the 2nd Shared Task on Discourse Relation Parsing and Treebanking (DISRPT 2021)},
  title     = {The {DISRPT} 2021 Shared Task on Elementary Discourse Unit Segmentation, Connective Detection, and Relation Classification},
  month     = nov,
  year      = {2021},
  address   = {Punta Cana, Dominican Republic},
  publisher = {Association for Computational Linguistics},
  pages     = {1--12},
  url       = {https://aclanthology.org/2021.disrpt-1.1},
  comment   = {paper},
}
@inproceedings{gessler-etal-2021-discodisco,
  author    = {Gessler, Luke and
               Behzad, Shabnam and
               Liu, Yang Janet and
               Peng, Siyao and
               Zhu, Yilun and
               Zeldes, Amir},
  title     = {{D}is{C}o{D}is{C}o at the {DISRPT}2021 Shared Task: A System for Discourse Segmentation, Classification, and Connective Detection},
  booktitle = {Proceedings of the 2nd Shared Task on Discourse Relation Parsing and Treebanking (DISRPT 2021)},
  publisher = {Association for Computational Linguistics},
  address   = {Punta Cana, Dominican Republic},
  month     = nov,
  year      = {2021},
  pages     = {51--62},
  url       = {https://aclanthology.org/2021.disrpt-1.6},
  comment   = {paper},
}
@inproceedings{zhu-etal-2021-anatomy,
  author    = {Yilun Zhu and Sameer Pradhan and Amir Zeldes},
  title     = {Anatomy of {O}nto{GUM}{---}{A}dapting {GUM} to the {O}nto{N}otes Scheme to Evaluate Robustness of {SOTA} Coreference Algorithms},
  booktitle = {Proceedings of the Fourth Workshop on Computational Models of Reference, Anaphora and Coreference (CRAC 2021)},
  address   = {Punta Cana, Dominican Republic},
  month     = nov,
  year      = {2021},
  pages     = {141--149},
  url       = {https://aclanthology.org/2021.crac-1.15},
  comment   = {paper},
}
@inproceedings{ZhuEtAl2021,
  author    = {Yilun Zhu and Sameer Pradhan and Amir Zeldes},
  title     = {{OntoGUM}: Evaluating Contextualized {SOTA} Coreference Resolution on 12 More Genres},
  booktitle = {Proceedings of ACL-IJCNLP 2021},
  address   = {Bangkok, Thailand},
  year      = {2021},
  pages     = {461--467},
  url       = {https://aclanthology.org/2021.acl-short.59.pdf},
  comment   = {paper},
}
@InProceedings{ManningEtAl2021,
  author    = {Emma Manning and Nathan Schneider and Amir Zeldes},
  booktitle = {Proceedings of the Fifth Workshop on Teaching NLP at NAACL 2021},
  title     = {A Balanced and Broadly Targeted Computational Linguistics Curriculum},
  year      = {2021},
  address   = {Mexico City, Mexico},
  pages     = {65--69},
  url       = {https://aclanthology.org/2021.teachingnlp-1.11/},
  comment   = {paper},
}
@inproceedings{GesslerEtAl2021,
  author    = {Luke Gessler and Siyao Peng and Yang Liu and Yilun Zhu and Shabnam Behzad and Amir Zeldes},
  title     = {Overview of {AMALGUM} -- Large Silver Quality Annotations across {E}nglish Genres},
  booktitle = {Proceedings of the Society for Computation in Linguistics (SCiL), Vol. 4},
  address   = {online},
  year      = {2021},
  pages     = {434--437},
  url       = {https://scholarworks.umass.edu/scil/vol4/iss1/53/},
  comment   = {paper},
}
@InProceedings{ZeldesMartinTu2020,
  author    = {Amir Zeldes and Lance Martin and Sichang Tu},
  booktitle = {Proceedings of the SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (LaTeCH-CLfL 2020)},
  title     = {Exhaustive Entity Recognition for {C}optic: Challenges and Solutions},
  year      = {2020},
  address   = {Barcelona, Spain},
  pages     = {19--28},
  url       = {https://aclanthology.org/2020.latechclfl-1.3.pdf},
  comment   = {paper},
}
@Article{ZeldesLiu2020,
  author  = {Amir Zeldes and Yang Liu},
  journal = {Dialogue and Discourse},
  title   = {A Neural Approach to Discourse Relation Signal Detection},
  year    = {2020},
  number  = {2},
  pages   = {1--33},
  volume  = {11},
  comment = {paper},
  url     = {https://journals.uic.edu/ojs/index.php/dad/article/view/11372/9733},
}
@inproceedings{GesslerEtAl2020,
  author    = {Luke Gessler and Siyao Peng and Yang Liu and Yilun Zhu and Shabnam Behzad and Amir Zeldes},
  title     = {{AMALGUM} - A Free, Balanced, Multilayer {E}nglish Web Corpus},
  booktitle = {Proceedings of LREC 2020},
  pages     = {5267--5275},
  year      = {2020},
  address   = {Marseille, France},
  url       = {https://www.aclweb.org/anthology/2020.lrec-1.648.pdf},
  comment   = {paper},
}
@article{SchroederZeldes2020,
  author  = {Caroline T. Schroeder and Amir Zeldes},
  title   = {A Collaborative Ecosystem for Digital {C}optic Studies},
  journal = {Journal of Data Mining and Digital Humanities. Special Issue on Collecting, Preserving, and Disseminating Endangered Cultural Heritage for New Understandings through Multilingual Approaches},
  pages   = {1--9},
  year    = {2020},
  url     = {https://jdmdh.episciences.org/6797/pdf},
  comment = {paper},
}
@InProceedings{SanguinettiBoscoCassidyEtAl2020,
  author    = {Manuela Sanguinetti and Cristina Bosco and Lauren Cassidy and \"{O}zlem Çetinoğlu and Alessandra Teresa Cignarella and Teresa Lynn and Ines Rehbein and Josef Ruppenhofer and Djam\'{e} Seddah and Amir Zeldes},
  booktitle = {Proceedings of LREC 2020},
  title     = {Treebanking User-Generated Content: A Proposal for a Unified Representation in {U}niversal {D}ependencies},
  year      = {2020},
  address   = {Marseille, France},
  pages     = {5240--5250},
  comment   = {paper},
  url       = {https://www.aclweb.org/anthology/2020.lrec-1.645.pdf},
}
@inproceedings{behzad-zeldes-2020-cross,
  author    = {Shabnam Behzad and Amir Zeldes},
  title     = {A Cross-Genre Ensemble Approach to Robust {R}eddit Part of Speech Tagging},
  booktitle = {Proceedings of the 12th Web as Corpus Workshop},
  pages     = {50--56},
  year      = {2020},
  publisher = {European Language Resources Association},
  address   = {Marseille, France},
  language  = {English},
  ISBN      = {979-10-95546-68-9},
  url       = {https://www.aclweb.org/anthology/2020.wac-1.7},
  comment   = {paper},
}
@InProceedings{YuZhuLiuEtAl2019,
  author    = {Yue Yu and Yilun Zhu and Yang Liu and Yan Liu and Siyao Peng and Mackenzie Gong and Amir Zeldes},
  title     = {{GumDrop} at the {DISRPT2019} Shared Task: A Model Stacking Approach to Discourse Unit Segmentation and Connective Detection},
  booktitle = {Proceedings of Discourse Relation Treebanking and Parsing (DISRPT 2019)},
  year      = {2019},
  pages     = {133--143},
  address   = {Minneapolis, MN},
  url       = {https://www.aclweb.org/anthology/W19-2717},
  comment   = {paper}
}

@inproceedings{ZeldesDasMazieroEtAl2019,
  author    = {Amir Zeldes and Debopam Das and Erick Galani Maziero and Juliano Desiderato Antonio and Mikel Iruskieta},
  title     = {{The DISRPT 2019 Shared Task} on Elementary Discourse Unit Segmentation and Connective Detection},
  booktitle = {Proceedings of Discourse Relation Treebanking and Parsing (DISRPT 2019)},
  pages     = {97--104},
  year      = {2019},
  address   = {Minneapolis, MN},
  url       = {https://www.aclweb.org/anthology/W19-2713},
  comment   = {paper},
}

@InProceedings{GesslerLiuZeldes2019Signals,
  author    = {Luke Gessler and Yang Liu and Amir Zeldes},
  title     = {A Discourse Signal Annotation System for {RST} Trees},
  year      = {2019},
  booktitle = {Proceedings of the Workshop on Discourse Relation Parsing and Treebanking 2019},
  address   = {Minneapolis, MN},
  url       = {https://aclweb.org/anthology/papers/W/W19/W19-2708/},
  pages     = {56--61},
  comment   = {paper}
}

@article{liu2019discourse,
  title   = {Discourse Relations and Signaling Information: Anchoring Discourse Signals in {RST-DT}},
  author  = {Yang Liu and Amir Zeldes},
  journal = {Proceedings of the Society for Computation in Linguistics},
  volume  = {2},
  number  = {1},
  pages   = {314--317},
  year    = {2019}
}

@InProceedings{Gessler2019computel,
  title     = {Developing without developers: choosing labor-saving tools for language documentation apps},
  author    = {Luke Gessler},
  booktitle = {Proceedings of the 3rd Workshop on the Use of Computational Methods in the Study of Endangered Languages},
  volume    = {1},
  pages     = {6--13},
  year      = {2019},
  url       = {https://computel-workshop.org/wp-content/uploads/2019/02/CEL3_book_papers_draft.pdf#page=18},
  comment   = {paper}
}

@InProceedings{ZeldesAbramsUDW2018,
  author    = {Amir Zeldes and Mitchell Abrams},
  title     = {The {C}optic Universal Dependency Treebank},
  year      = {2018},
  booktitle = {Proceedings of the Universal Dependencies Workshop 2018 (UDW 2018)},
  address   = {Brussels, Belgium},
  url       = {http://aclweb.org/anthology/W18-6022},
  pages     = {192--201},
  comment   = {paper}
}
@InProceedings{Zeldes2018SIGMORPHON,
  author    = {Amir Zeldes},
  title     = {A Characterwise Windowed Approach to {H}ebrew Morphological Segmentation},
  booktitle = {Proceedings of the 15th {SIGMORPHON} Workshop on Computational Research in Phonetics, Phonology, and Morphology},
  year      = {2018},
  address   = {Brussels, Belgium},
  pages     = {101--110},
  url       = {http://aclweb.org/anthology/W18-5811},
  comment   = {paper}
}
@inproceedings{Zeldes2018AACL,
  author    = {Amir Zeldes},
  title     = {A Multi-Dimensional Analysis of {RST} Discourse Relations in Eight Genres},
  booktitle = {14th American Association of Corpus Linguistics Conference (AACL 2018)},
  year      = {2018},
  address   = {Atlanta, GA},
  url       = {https://gucorpling.org/amir/pdf/AACL_2018_MD_RST.pdf},
  comment   = {abstract},
}
@inproceedings{PengZeldes2018AACL,
  author    = {Siyao Peng and Amir Zeldes},
  title     = {Validating and Merging a Growing Multilayer Corpus – the Case of {GUM}},
  booktitle = {14th American Association of Corpus Linguistics Conference (AACL 2018)},
  year      = {2018},
  address   = {Atlanta, GA},
  url       = {https://gucorpling.org/amir/pdf/AACL2018_gum_abstract.pdf},
  comment   = {abstract},
}
@InProceedings{FederKupreyevManningEtAl2018,
  author    = {Frank Feder and Maxim Kupreyev and Emma Manning and Caroline T. Schroeder and Amir Zeldes},
  title     = {A Linked {C}optic Dictionary Online},
  booktitle = {Proceedings of LaTeCH 2018 - The 11th SIGHUM Workshop at COLING2018},
  year      = {2018},
  address   = {Santa Fe, NM},
  pages     = {12--21},
  url       = {http://aclweb.org/anthology/W18-4502},
  comment   = {paper}
}
@InProceedings{PengZeldes2018COLING,
  author    = {Siyao Peng and Amir Zeldes},
  title     = {All Roads Lead to {UD}: Converting Stanford and Penn Parses to {E}nglish Universal Dependencies with Multilayer Annotations},
  booktitle = {{COLING} 2018 Joint Workshop on Linguistic Annotation, Multiword Expressions and Constructions ({LAW-MWE-CxG-2018})},
  year      = {2018},
  address   = {Santa Fe, NM},
  pages     = {167--177},
  url       = {http://aclweb.org/anthology/W18-4918},
  comment   = {paper}
}
@InCollection{Zeldes2018SC,
  author    = {Amir Zeldes},
  title     = {Compounds and Productivity in Advanced {L2} {G}erman Writing: A Constructional Approach},
  booktitle = {Usage-inspired L2 Instruction: Researched Pedagogy},
  year      = {2018},
  editor    = {Lourdes Ortega and Andrea Tyler and Mariko Uno and Hae In Park},
  note      = {3},
  publisher = {John Benjamins},
  pages     = {237--265},
  url       = {https://gucorpling.org/amir/pdf/Compounds_and_Productivity_Zeldes_revised.pdf},
  comment   = {prepub version},
  doi       = {https://doi.org/10.1075/lllt.49.11zel},
  address   = {Amsterdam}
}
@InProceedings{Zeldes2018CRAC,
  author    = {Amir Zeldes},
  title     = {A Predictive Model for Notional Anaphora in {E}nglish},
  booktitle = {{NAACL} 2018 Workshop on Computational Models of Reference, Anaphora, and Coreference ({CRAC})},
  year      = {2018},
  address   = {New Orleans, LA},
  pages     = {34--43},
  url       = {http://aclweb.org/anthology/W18-0704},
  comment   = {paper}
}
@InProceedings{MacAvaneyZeldes2018,
  author    = {Sean MacAvaney and Amir Zeldes},
  title     = {A Deeper Look into Dependency-Based Word Embeddings},
  booktitle = {NAACL 2018 Student Research Workshop},
  year      = {2018},
  pages     = {40--45},
  url       = {http://aclweb.org/anthology/N18-4006},
  address   = {New Orleans, LA}
}
@article{Simpson_etal,
  author  = {Sean Simpson and Nikki Adams and Claudia M. Brugman and Thomas J. Conners},
  title   = {Detecting Novel and Emerging Drug Terms Using Natural Language Processing: A Social Media Corpus Study},
  journal = {Journal of Medical Internet Research: Public Health and Surveillance},
  volume  = {4},
  number  = {1},
  pages   = {e2},
  year    = {2018},
  url     = {http://publichealth.jmir.org/2018/1/e2/},
}
@InCollection{Simpson_2017,
  author    = {Anna Belew and Sean Simpson},
  title     = {{Language extinction then and now}},
  booktitle = {{Cataloguing the Endangered Languages of the World}},
  year      = {2018},
  editor    = {Lyle Campbell and Anna Belew},
  publisher = {Taylor and Francis},
  address   = {Abingdon}
}

@incollection{Simpson_2017a,
  author    = {Raina Heaton and Sean Simpson},
  title     = {{How ELCat Serves Communities whose Languages are at Risk}},
  booktitle = {{Cataloguing the Endangered Languages of the World}},
  editor    = {Lyle Campbell and John Van Way and Anna Belew},
  year      = {2018},
  publisher = {Taylor and Francis},
  address   = {Abingdon},
}
@InCollection{Belew_2017,
  author    = {Anna Belew and Sean Simpson},
  title     = {{The Status of the World's Endangered Languages}},
  booktitle = {{The Oxford Handbook of Endangered Languages}},
  year      = {2018},
  editor    = {Lyle Campbell and Kenneth L. Rehg},
  publisher = {Oxford University Press},
  address   = {Oxford}
}
@inproceedings{Zeldes2018GURT,
  author    = {Amir Zeldes},
  title     = {A Neural Approach to Discourse Relation Signaling},
  booktitle = {Georgetown University Round Table (GURT) 2018: Approaches to Discourse},
  year      = {2018},
  address   = {Washington, DC},
  url       = {https://gucorpling.org/amir/pdf/neural_DR_abstract_GURT2018.pdf},
  comment   = {abstract},
  url2      = {https://gucorpling.org/amir/pdf/RST_GURT2018_final.pdf},
  comment2  = {slides},
}

@InProceedings{Simpson2017,
  author    = {Sean Simpson},
  title     = {{Reconstructing Thread Structure For Reddit Comments}},
  booktitle = {{Mid-Atlantic Student Colloquium on Speech, Language and Learning (MASC-SLL)}},
  address   = {{Washington, DC}},
  year      = {2017}
}

@phdthesis{Zhang2017,
  author = {Shuo Zhang},
  title  = {Mining Linguistic Tone Patterns Using Fundamental Frequency Time-Series Data},
  type   = {PhD Thesis},
  school = {Georgetown University},
  year   = {2017},
  url    = {https://repository.library.georgetown.edu/handle/10822/1047816},
}

@InProceedings{Zeldes2017a,
  author    = {Amir Zeldes},
  title     = {A Distributional View of Discourse Encapsulation: Multifactorial Prediction of Coreference Density in {RST}},
  booktitle = {6th Workshop on Recent Advances in RST and Related Formalisms at INLG},
  year      = {2017},
  pages     = {20--28},
  address   = {Santiago de Compostela, Spain},
  url       = {http://aclweb.org/anthology/W17-3603},
  comment   = {paper}
}
@InProceedings{ZhangZeldes2017,
  author    = {Shuo Zhang and Amir Zeldes},
  title     = {{GitDOX}: A Linked Version Controlled Online {XML} Editor for Manuscript Transcription},
  booktitle = {Proceedings of FLAIRS-30},
  year      = {2017},
  pages     = {619--623},
  address   = {Marco Island, FL},
  url       = {https://gucorpling.org/amir/pdf/GitDOX_A_Linked_Version_Controlled_Online_XML_Editor_prepub.pdf},
  comment   = {paper}
}
@Article{Zeldes2017b,
  author    = {Amir Zeldes},
  title     = {The {GUM} Corpus: Creating Multilayer Resources in the Classroom},
  journal   = {Language Resources and Evaluation},
  year      = {2017},
  volume    = {51},
  number    = {3},
  pages     = {581--612},
  url       = {https://gucorpling.org/amir/pdf/GUM_paper_prepub.pdf},
  comment   = {prepub version},
  doi       = {http://dx.doi.org/10.1007/s10579-016-9343-x},
  url2      = {http://rdcu.be/uONb},
  comment2  = {read online}
}
@article{GaetaZeldes2017,
  author  = {Livio Gaeta and Amir Zeldes},
  title   = {Between {VP} and {NN}: On the Constructional Types of German -er Compounds},
  journal = {Constructions and Frames},
  volume  = {9},
  number  = {1},
  year    = {2017},
  doi     = {https://doi.org/10.1075/cf.9.1.01gae},
  url     = {https://gucorpling.org/amir/pdf/Between_VP_and_NN_prepub.pdf},
  comment = {prepub version},
}
@phdthesis{simonsondiss17,
  author = {Dan Simonson},
  title  = {Investigations of the Properties of Narrative Schemas},
  type   = {PhD Thesis},
  school = {Georgetown University},
  year   = {2017},
}
@Article{SpalekZeldes2017,
  author    = {Katharina Spalek and Amir Zeldes},
  title     = {Converging Evidence for the Relevance of Alternative Sets: Data from {NP}s with Focus Sensitive Particles in {G}erman},
  journal   = {Language and Cognition},
  year      = {2017},
  volume    = {9},
  number    = {1},
  pages     = {24--51},
  doi       = {http://dx.doi.org/10.1017/langcog.2015.12},
  url       = {https://gucorpling.org/amir/pdf/alternatives_prepub.pdf},
  comment   = {prepub version}
}
@InProceedings{RankLyricsPoster,
  author    = {Shuo Zhang},
  title     = {{RankLyrics}: A ranking-based approach to automatic song lyrics generation},
  year      = {2017},
  booktitle = {6th Mid-Atlantic Student Colloquium on Speech, Language, and Learning},
  address   = {George Washington University, Washington DC}
}
@Article{OdebrechtEtAl2017,
  author    = {Carolin Odebrecht and Malte Belz and Amir Zeldes and Anke L\"{u}deling},
  title     = {{RIDGES} Herbology - Designing a Diachronic Multi-Layer Corpus},
  journal   = {Language Resources and Evaluation},
  year      = {2017},
  volume    = {51},
  number    = {3},
  pages     = {695--725},
  doi       = {10.1007/s10579-016-9374-3},
  url       = {https://gucorpling.org/corpling/dx.doi.org/10.1007/s10579-016-9374-3},
  comment   = {prepub version},
  url2      = {http://www.readcube.com/articles/10.1007/s10579-016-9374-3},
  comment2  = {read online}
}
@InProceedings{ZeldesSimonson2016,
  author    = {Amir Zeldes and Dan Simonson},
  title     = {Different Flavors of {GUM}: Evaluating Genre and Sentence Type Effects on Multilayer Corpus Annotation Quality},
  booktitle = {Proceedings of LAW X – The 10th Linguistic Annotation Workshop},
  year      = {2016},
  pages     = {68--78},
  address   = {Berlin},
  url       = {https://aclweb.org/anthology/W/W16/W16-1709.pdf},
  comment   = {paper}
}
@article{Kirtley_etal,
  author  = {Joelle Kirtley and Katie Drager and James Grama and Sean Simpson},
  title   = {An acoustic analysis of the vowels of {H}awai‘i {E}nglish},
  journal = {Journal of the International Phonetic Association},
  volume  = {46},
  number  = {1},
  pages   = {79--97},
  year    = {2016},
  url     = {https://www.cambridge.org/core/journals/journal-of-the-international-phonetic-association/article/div-classtitlean-acoustic-analysis-of-the-vowels-of-hawaii-englishdiv/3738B709E9967BA01B873DACF6FD7086}
}
@InProceedings{ZeldesSchroeder2016,
  author    = {Amir Zeldes and Caroline T. Schroeder},
  title     = {An {NLP} Pipeline for {C}optic},
  booktitle = {Proceedings of the 10th ACL SIGHUM Workshop on Language Technology for Cultural Heritage, Social Sciences, and Humanities (LaTeCH2016)},
  year      = {2016},
  pages     = {146--155},
  address   = {Berlin},
  url       = {http://aclweb.org/anthology/W/W16/W16-2119.pdf},
  comment   = {paper}
}
@InProceedings{Zeldes2016b,
  author    = {Amir Zeldes},
  title     = {{rstWeb} - A Browser-based Annotation Interface for Rhetorical Structure Theory and Discourse Relations},
  booktitle = {Proceedings of NAACL-HLT 2016 System Demonstrations},
  year      = {2016},
  pages     = {1--5},
  address   = {San Diego, CA},
  url       = {http://aclweb.org/anthology/N/N16/N16-3001.pdf},
  comment   = {paper}
}
@InProceedings{ZeldesZhang2016,
  author    = {Amir Zeldes and Shuo Zhang},
  title     = {When Annotation Schemes Change Rules Help: A Configurable Approach to Coreference Resolution beyond {OntoNotes}},
  booktitle = {Proceedings of the NAACL2016 Workshop on Coreference Resolution Beyond OntoNotes (CORBON)},
  year      = {2016},
  pages     = {92--101},
  address   = {San Diego, CA},
  url       = {http://aclweb.org/anthology/W/W16/W16-0713.pdf},
  comment   = {paper}
}
@inproceedings{Zeldes2016d,
  author    = {Amir Zeldes},
  title     = {A Quantitative Approach to Syntactic Alternations in {S}ahidic},
  booktitle = {11th Congress of the International Association of Coptic Studies (IACS 2016)},
  address   = {Claremont, CA},
  year      = {2016},
}
@InProceedings{W16-2001,
  author    = {Shuo Zhang},
  title     = {Mining Linguistic Tone Patterns with Symbolic Representation},
  booktitle = {Proceedings of the 14th {SIGMORPHON} Workshop on Computational Research in Phonetics, Phonology, and Morphology},
  year      = {2016},
  pages     = {1--9},
  address   = {Berlin, Germany},
  doi       = {https://doi.org/10.18653/v1/W16-2001},
  url       = {http://www.aclweb.org/anthology/W16-2001}
}
@InProceedings{simonsondavis2016,
  author    = {Dan Simonson and Anthony Davis},
  title     = {{NASTEA}: Investigating Narrative Schemas through Annotated Entities},
  booktitle = {{Proceedings of the Second Workshop on Computing News Storylines (CNS 2016) at EMNLP 2016}},
  year      = {2016},
  address   = {Austin, Texas},
  pages     = {57--66},
  url       = {http://aclweb.org/anthology/W16-5707}
}
@article{SchroederZeldes2015,
  author  = {Caroline T. Schroeder and Amir Zeldes},
  title   = {Raiders of the Lost Corpus},
  journal = {Digital Humanities Quarterly},
  volume  = {10},
  number  = {2},
  year    = {2016},
  url     = {http://digitalhumanities.org/dhq/vol/10/2/000247/000247.html}
}
@Article{Zeldes2016c,
  author    = {Amir Zeldes},
  title     = {Probabilistic Pragmatics and Probabilistic Experience},
  journal   = {Zeitschrift f\"{u}r Sprachwissenschaft},
  year      = {2016},
  volume    = {35},
  number    = {1},
  pages     = {109--116},
  doi       = {http://dx.doi.org/10.1515/zfs-2016-0008},
  url       = {https://gucorpling.org/amir/pdf/prob_prag_zeldes_2016.pdf},
  comment   = {prepub version}
}
@Article{KrauseZeldes2014,
  title   = {{ANNIS3}: A New Architecture for Generic Corpus Query and Visualization},
  author  = {Thomas Krause and Amir Zeldes},
  journal = {Digital Scholarship in the Humanities},
  volume  = {31},
  number  = {1},
  pages   = {118--139},
  year    = {2016},
  note    = {2},
  doi     = {http://dx.doi.org/10.1093/llc/fqu057},
  url     = {https://gucorpling.org/amir/pdf/ANNIS3_DSH_prepub.pdf},
  comment = {prepub version}
}
@techreport{ZeldesSchroeder2013,
  author  = {Amir Zeldes and Caroline T. Schroeder},
  title   = {SCRIPTORIUM Part-of-Speech Tagsets for {S}ahidic {C}optic. Version: 1.2.0_2018.07.03},
  address = {Georgetown University and University of the Pacific},
  year    = {2018},
  note    = {9},
  url     = {https://github.com/CopticScriptorium/tagger-part-of-speech/raw/master/scriptorium_tagset_documentation.pdf},
}
@TechReport{ZeldesUDCoptic2016,
  author  = {Amir Zeldes},
  title   = {{C}optic Universal Dependency Guidelines. Version 1.1.0-2016-10-19},
  address = {Georgetown University},
  year    = {2016},
  url     = {https://github.com/CopticScriptorium/corpora/raw/master/coptic-treebank/Coptic_universal_dependencies_guidelines.pdf}
}
@InCollection{LuedelingRitzStedeEtAl2016,
  author    = {Anke L\"{u}deling and Julia Ritz and Manfred Stede and Amir Zeldes},
  title     = {Corpus Linguistics and Information Structure Research},
  booktitle = {The Oxford Handbook of Information Structure},
  year      = {2016},
  editor    = {Caroline F\'{e}ry and Shinichiro Ichihara},
  note      = {3},
  publisher = {Oxford University Press},
  pages     = {599--617},
  doi       = {https://dx.doi.org/10.1093/oxfordhb/9780199642670.013.013},
  address   = {Oxford}
}
 @TechReport{CASL_tech_2015,
  author      = {Thomas J. Conners and Claudia M. Brugman and Paul Rodrigues and Sean Simpson},
  title       = {{Summary of register properties from a corpus of Jakarta Indonesian Twitter}},
  institution = {{Center for Advanced Study of Language}},
  journal     = {{Center for Advanced Study of Language}},
  address     = {{University of Maryland}},
  year        = {2015}
}
@Article{LeeEtAl2015,
  title   = {{CityU} Corpus of Essay Drafts of {E}nglish Language Learners: A Corpus of Textual Revision in Second Language Writing},
  author  = {John Lee and Chak Yan Yeung and Amir Zeldes and Marc Reznicek and Anke Lüdeling and Jonathan Webster},
  journal = {Language Resources and Evaluation},
  year    = {2015},
  note    = {2},
  number  = {3},
  pages   = {659--683},
  volume  = {49},
  doi     = {http://link.springer.com/article/10.1007/s10579-015-9301-z},
  url     = {https://gucorpling.org/amir/pdf/annis_cityu_prepub.pdf},
  comment = {prepub version}
}
@inproceedings{simonsondavis2015,
  title     = {Interactions between Narrative Schemas and Document Categories},
  author    = {Dan Simonson and Anthony Davis},
  booktitle = {{Proceedings of the First Computing News Storylines Workshop (CnewS) at ACL-IJCNLP 2015}},
  pages     = {1--10},
  year      = {2015},
  url       = {http://www.aclweb.org/anthology/W15-4501}
}
@inproceedings{SPMPoster,
  author    = {Shuo Zhang},
  title     = {Analyze Linguistic Tone Patterns Using Time-Series Data Mining Techniques},
  booktitle = {15th Workshop on Computational Phonology and Morphology ({CompMorPhon15})},
  address   = {Chicago},
  year      = {2015},
}
@Article{ZeldesSchroeder2015,
  title   = {Computational Methods for {C}optic: Developing and Using Part-of-Speech Tagging for Digital Scholarship in the Humanities},
  author  = {Amir Zeldes and Caroline T. Schroeder},
  journal = {Digital Scholarship in the Humanities},
  year    = {2015},
  note    = {2},
  number  = {1},
  pages   = {164--176},
  volume  = {31},
  url     = {https://gucorpling.org/amir/pdf/Computational_Methods_for_Coptic_prepub.pdf},
  doi     = {http://dx.doi.org/10.1093/llc/fqv057},
  comment = {prepub version}
}
@Article{RomaryZeldesZipserer2015,
  title   = {&lt;tiger2/&gt; - Serialising the ISO SynAF Syntactic Object Model},
  author  = {Laurent Romary and Amir Zeldes and Florian Zipser},
  journal = {Language Resources and Evaluation},
  year    = {2015},
  note    = {2},
  pages   = {1--18},
  volume  = {49},
  doi     = {http://dx.doi.org/10.1007/s10579-014-9288-x}
}
@InCollection{PetrovaZeldes2015,
  title     = {How exceptional is CP recursion in Germanic OV languages? Corpus-based evidence from Middle Low German},
  author    = {Svetlana Petrova and Amir Zeldes},
  booktitle = {Historical Corpora: Challenges and Perspectives},
  publisher = {Narr},
  year      = {2015},
  address   = {Tübingen},
  editor    = {Jost Gippert and Ralf Gehrke},
  note      = {3},
  pages     = {151--164},
  series    = {Korpuslinguistik und interdisziplinäre Perspektiven auf Sprache 5},
  url       = {https://books.google.com/books?id=7anlBgAAQBAJ&lpg=PP1&pg=PA151#v=onepage&q&f=false},
  comment   = {Google Books}
}
@InProceedings{Zeldes2015a,
  title     = {Duplicitous Diabolos: Parallel Witness Encoding in Quantitative Studies of {C}optic Manuscripts},
  author    = {Amir Zeldes},
  booktitle = {Proceedings of Balisage, Symposium on Cultural Heritage Markup. Balisage Series on Markup Technologies, vol. 16},
  year      = {2015},
  address   = {Washington, DC},
  note      = {4},
  url       = {http://www.balisage.net/Proceedings/vol16/print/Zeldes01/BalisageVol16-Zeldes01.html},
  comment   = {link}
}
@InProceedings{Zeldes2015,
  title     = {Tagging the Desert Fathers: Part of Speech Analysis in {S}ahidic {C}optic Corpora},
  author    = {Amir Zeldes},
  booktitle = {43rd Annual North American Conference on Afroasiatic Linguistics (NACAL2015), 13-15.2.2015},
  year      = {2015},
  address   = {Washington, DC},
  note      = {4},
}

Publications

Updates

Get in touch