<?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl" href="static/style.xsl"?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2026-04-14T03:27:45Z</responseDate><request verb="GetRecord" identifier="oai:www.recercat.cat:10230/54140" metadataPrefix="didl">https://recercat.cat/oai/request</request><GetRecord><record><header><identifier>oai:recercat.cat:10230/54140</identifier><datestamp>2025-12-23T20:52:49Z</datestamp><setSpec>com_2072_6</setSpec><setSpec>col_2072_452954</setSpec></header><metadata><d:DIDL xmlns:d="urn:mpeg:mpeg21:2002:02-DIDL-NS" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:doc="http://www.lyncode.com/xoai" xsi:schemaLocation="urn:mpeg:mpeg21:2002:02-DIDL-NS http://standards.iso.org/ittf/PubliclyAvailableStandards/MPEG-21_schema_files/did/didl.xsd">
   <d:Item id="hdl_10230_54140">
      <d:Descriptor>
         <d:Statement mimeType="application/xml; charset=utf-8">
            <dii:Identifier xmlns:dii="urn:mpeg:mpeg21:2002:01-DII-NS" xsi:schemaLocation="urn:mpeg:mpeg21:2002:01-DII-NS http://standards.iso.org/ittf/PubliclyAvailableStandards/MPEG-21_schema_files/dii/dii.xsd">urn:hdl:10230/54140</dii:Identifier>
         </d:Statement>
      </d:Descriptor>
      <d:Descriptor>
         <d:Statement mimeType="application/xml; charset=utf-8">
            <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
               <dc:title>Building a Catalan-Chinese parallel corpus from Wikipedia for use in machine translation</dc:title>
               <dc:creator>Zhou, Chenyue</dc:creator>
               <dc:subject>Parallel corpus</dc:subject>
               <dc:subject>Data mining</dc:subject>
               <dc:subject>Corpus quality</dc:subject>
               <dc:subject>Machine translation</dc:subject>
               <dc:subject>Catalan</dc:subject>
               <dc:subject>Chinese</dc:subject>
               <dc:subject>Low-resource languages</dc:subject>
               <dc:description>Treball de fi de màster en Lingüística Teòrica i Aplicada. Directora: Dra. Maite Melero</dc:description>
               <dc:description>The lack of parallel corpora is one of the biggest challenges hindering progress in&#xd;
Machine Translation for low-resource languages. In this work, we crawl and filter&#xd;
parallel sentences in Catalan and Chinese from Wikipedia in order to compile a&#xd;
parallel corpus of good quality. This paper describes the processes we follow to build&#xd;
the corpus, including mining the text data, computing sentence embeddings,&#xd;
extracting sentence alignment and filtering for better corpus quality. We manually&#xd;
audit the corpus quality based on an error taxonomy. Results show that the automatic&#xd;
filtering we applied makes a great improvement in the quality of our web-crawled&#xd;
corpus. The corpus is later used as training data to finetune a multilingual Machine&#xd;
Translation (MT) system in both CA→ZH and ZH→CA directions. Results show that&#xd;
finetuning with our corpus successfully managed to improve BLEU score in both&#xd;
directions on the Flores-101 public benchmark test sets, which demonstrates the&#xd;
importance of corpus in MT and the quality of our Catalan-Chinese parallel corpus.</dc:description>
               <dc:date>2022-09-21T16:55:53Z</dc:date>
               <dc:date>2022-09-21T16:55:53Z</dc:date>
               <dc:date>2022-09-21</dc:date>
               <dc:type>info:eu-repo/semantics/masterThesis</dc:type>
               <dc:rights>Llicència CC Reconeixement-NoComercial-SenseObraDerivada 4.0 Internacional (CC BY-NC-ND 4.0)</dc:rights>
               <dc:rights>https://creativecommons.org/licenses/by-nc-nd/4.0/</dc:rights>
               <dc:rights>info:eu-repo/semantics/openAccess</dc:rights>
            </oai_dc:dc>
         </d:Statement>
      </d:Descriptor>
   </d:Item>
</d:DIDL></metadata></record></GetRecord></OAI-PMH>