<?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl" href="static/style.xsl"?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2026-04-17T14:15:37Z</responseDate><request verb="GetRecord" identifier="oai:www.recercat.cat:10230/54140" metadataPrefix="mets">https://recercat.cat/oai/request</request><GetRecord><record><header><identifier>oai:recercat.cat:10230/54140</identifier><datestamp>2025-12-23T20:52:49Z</datestamp><setSpec>com_2072_6</setSpec><setSpec>col_2072_452954</setSpec></header><metadata><mets xmlns="http://www.loc.gov/METS/" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:doc="http://www.lyncode.com/xoai" ID="&#xa;&#x9;&#x9;&#x9;&#x9;DSpace_ITEM_10230-54140" TYPE="DSpace ITEM" PROFILE="DSpace METS SIP Profile 1.0" xsi:schemaLocation="http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/mets.xsd" OBJID="&#xa;&#x9;&#x9;&#x9;&#x9;hdl:10230/54140">
   <metsHdr CREATEDATE="2026-04-17T14:01:46Z">
      <agent ROLE="CUSTODIAN" TYPE="ORGANIZATION">
         <name>RECERCAT</name>
      </agent>
   </metsHdr>
   <dmdSec ID="DMD_10230_54140">
      <mdWrap MDTYPE="MODS">
         <xmlData xmlns:mods="http://www.loc.gov/mods/v3" xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-1.xsd">
            <mods:mods xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-1.xsd">
               <mods:name>
                  <mods:role>
                     <mods:roleTerm type="text">author</mods:roleTerm>
                  </mods:role>
                  <mods:namePart>Zhou, Chenyue</mods:namePart>
               </mods:name>
               <mods:originInfo>
                  <mods:dateIssued encoding="iso8601">2022-09-21</mods:dateIssued>
               </mods:originInfo>
               <mods:identifier type="none"/>
               <mods:abstract>Treball de fi de màster en Lingüística Teòrica i Aplicada. Directora: Dra. Maite Melero. The lack of parallel corpora is one of the biggest challenges hindering progress in&#xd;
Machine Translation for low-resource languages. In this work, we crawl and filter&#xd;
parallel sentences in Catalan and Chinese from Wikipedia in order to compile a&#xd;
parallel corpus of good quality. This paper describes the processes we follow to build&#xd;
the corpus, including mining the text data, computing sentence embeddings,&#xd;
extracting sentence alignment and filtering for better corpus quality. We manually&#xd;
audit the corpus quality based on an error taxonomy. Results show that the automatic&#xd;
filtering we applied makes a great improvement in the quality of our web-crawled&#xd;
corpus. The corpus is later used as training data to finetune a multilingual Machine&#xd;
Translation (MT) system in both CA→ZH and ZH→CA directions. Results show that&#xd;
finetuning with our corpus successfully managed to improve BLEU score in both&#xd;
directions on the Flores-101 public benchmark test sets, which demonstrates the&#xd;
importance of corpus in MT and the quality of our Catalan-Chinese parallel corpus.</mods:abstract>
               <mods:language>
                  <mods:languageTerm authority="rfc3066"/>
               </mods:language>
               <mods:accessCondition type="useAndReproduction">Llicència CC Reconeixement-NoComercial-SenseObraDerivada 4.0 Internacional (CC BY-NC-ND 4.0) https://creativecommons.org/licenses/by-nc-nd/4.0/ info:eu-repo/semantics/openAccess</mods:accessCondition>
               <mods:subject>
                  <mods:topic>Parallel corpus</mods:topic>
               </mods:subject>
               <mods:subject>
                  <mods:topic>Data mining</mods:topic>
               </mods:subject>
               <mods:subject>
                  <mods:topic>Corpus quality</mods:topic>
               </mods:subject>
               <mods:subject>
                  <mods:topic>Machine translation</mods:topic>
               </mods:subject>
               <mods:subject>
                  <mods:topic>Catalan</mods:topic>
               </mods:subject>
               <mods:subject>
                  <mods:topic>Chinese</mods:topic>
               </mods:subject>
               <mods:subject>
                  <mods:topic>Low-resource languages</mods:topic>
               </mods:subject>
               <mods:titleInfo>
                  <mods:title>Building a Catalan-Chinese parallel corpus from Wikipedia for use in machine translation</mods:title>
               </mods:titleInfo>
               <mods:genre>info:eu-repo/semantics/masterThesis</mods:genre>
            </mods:mods>
         </xmlData>
      </mdWrap>
   </dmdSec>
   <structMap LABEL="DSpace Object" TYPE="LOGICAL">
      <div TYPE="DSpace Object Contents" DMDID="DMD_10230_54140"/>
   </structMap>
</mets></metadata></record></GetRecord></OAI-PMH>