<?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl" href="static/style.xsl"?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2026-04-19T19:17:36Z</responseDate><request verb="GetRecord" identifier="oai:www.recercat.cat:2117/429635" metadataPrefix="marc">https://recercat.cat/oai/request</request><GetRecord><record><header><identifier>oai:recercat.cat:2117/429635</identifier><datestamp>2025-07-23T02:56:38Z</datestamp><setSpec>com_2072_1033</setSpec><setSpec>col_2072_452951</setSpec></header><metadata><record xmlns="http://www.loc.gov/MARC21/slim" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:doc="http://www.lyncode.com/xoai" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">
   <leader>00925njm 22002777a 4500</leader>
   <datafield ind2=" " ind1=" " tag="042">
      <subfield code="a">dc</subfield>
   </datafield>
   <datafield ind2=" " ind1=" " tag="720">
      <subfield code="a">Cabré Guerrero, Víctor</subfield>
      <subfield code="e">author</subfield>
   </datafield>
   <datafield ind2=" " ind1=" " tag="260">
      <subfield code="c">2025-01-16</subfield>
   </datafield>
   <datafield ind2=" " ind1=" " tag="520">
      <subfield code="a">Language identification is a fundamental Natural Language Processing (NLP) task with wide-ranging applications, from machine translation to preprocessing training data for Large Language Models. While existing models demonstrate high performance within specific domains, their effectiveness significantly deteriorates when applied across different linguistic contexts. This study presents an in-depth analysis of cross-domain language identification methods, evaluating their performance, capabilities, and inherent limitations. Our research investigates the challenges posed by the linguistic diversity found in modern communication. Experiments conducted on a dataset spanning 2,034 languages reveal significant performance variations across domains. Models trained on specific domains like wiki, news, and religious texts show high in-domain accuracy but struggle to maintain performance when applied to different linguistic contexts. Our analysis highlights the need for more adaptable, context-aware language identification systems that can effectively handle the complexity of modern language use. Key findings include the limited transferability of domain-specific features, the nuanced challenges of advanced tokenization, and the complex error patterns arising from language similarities and data inconsistencies. This research contributes to the ongoing dialogue about developing more robust language identification technologies that can adapt to our increasingly diverse linguistic landscape.</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Àrees temàtiques de la UPC::Informàtica::Intel·ligència artificial::Aprenentatge automàtic</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Natural language processing (Computer science))</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Machine learning</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Linguistics</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">langid</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">language</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">identification</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">langid.py</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">glot500</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">identificacio</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">de</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">llenguatges</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">intelligencia</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">artificial</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">aprenentatge</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">automatic</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">fasttext</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">textcat</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">intelligence</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">machine</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">learning</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Tractament del llenguatge natural (Informàtica)</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Aprenentatge automàtic</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Lingüística</subfield>
   </datafield>
   <datafield ind2="0" ind1="0" tag="245">
      <subfield code="a">In-depth evaluation of cross-domain language identification methods</subfield>
   </datafield>
</record></metadata></record></GetRecord></OAI-PMH>