<?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl" href="static/style.xsl"?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2026-04-17T00:57:35Z</responseDate><request verb="GetRecord" identifier="oai:www.recercat.cat:10230/70025" metadataPrefix="mets">https://recercat.cat/oai/request</request><GetRecord><record><header><identifier>oai:recercat.cat:10230/70025</identifier><datestamp>2025-12-24T08:36:23Z</datestamp><setSpec>com_2072_6</setSpec><setSpec>col_2072_452952</setSpec></header><metadata><mets xmlns="http://www.loc.gov/METS/" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:doc="http://www.lyncode.com/xoai" ID="&#xa;&#x9;&#x9;&#x9;&#x9;DSpace_ITEM_10230-70025" TYPE="DSpace ITEM" PROFILE="DSpace METS SIP Profile 1.0" xsi:schemaLocation="http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/mets.xsd" OBJID="&#xa;&#x9;&#x9;&#x9;&#x9;hdl:10230/70025">
   <metsHdr CREATEDATE="2026-04-17T02:57:35Z">
      <agent ROLE="CUSTODIAN" TYPE="ORGANIZATION">
         <name>RECERCAT</name>
      </agent>
   </metsHdr>
   <dmdSec ID="DMD_10230_70025">
      <mdWrap MDTYPE="MODS">
         <xmlData xmlns:mods="http://www.loc.gov/mods/v3" xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-1.xsd">
            <mods:mods xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-1.xsd">
               <mods:name>
                  <mods:role>
                     <mods:roleTerm type="text">author</mods:roleTerm>
                  </mods:role>
                  <mods:namePart>Montesinos García, Juan Felipe</mods:namePart>
               </mods:name>
               <mods:name>
                  <mods:role>
                     <mods:roleTerm type="text">author</mods:roleTerm>
                  </mods:role>
                  <mods:namePart>Kadandale, Venkatesh S.</mods:namePart>
               </mods:name>
               <mods:name>
                  <mods:role>
                     <mods:roleTerm type="text">author</mods:roleTerm>
                  </mods:role>
                  <mods:namePart>Haro Ortega, Gloria</mods:namePart>
               </mods:name>
               <mods:originInfo>
                  <mods:dateIssued encoding="iso8601">2025-03-27T07:24:43Z2025-03-27T07:24:43Z2022</mods:dateIssued>
               </mods:originInfo>
               <mods:identifier type="none"/>
               <mods:abstract>This paper presents an audio-visual approach for voice separation which produces state-of-the-art results at a low latency in two scenarios: speech and singing voice. The model is based on a two-stage network. Motion cues are obtained with a lightweight graph convolutional network that processes face landmarks. Then, both audio and motion features are fed to an audio-visual transformer which produces a fairly good estimation of the isolated target source. In a second stage, the predominant voice is enhanced with an audio-only network. We present different ablation studies and comparison to state-of-the-art methods.
Finally, we explore the transferability of models trained for speech separation in the task of singing voice separation. The demos, code, and weights are available in https://ipcv.github.io/VoViT/.We acknowledge support by MICINN/FEDER UE project PID2021-127643NB-I00; H2020-MSCA-RISE-2017 project 777826 NoMADS. J.F.M. acknowledges support by FPI scholarship PRE2018-083920. We acknowledge NVIDIA Corporation for the donation of GPUs used for the experiments.</mods:abstract>
               <mods:language>
                  <mods:languageTerm authority="rfc3066"/>
               </mods:language>
               <mods:accessCondition type="useAndReproduction">© The Author(s), under exclusive license to Springer Nature Switzerland AG 2022 S. Avidan et al. (Eds.): ECCV 2022, LNCS 13697, pp. 310–326, 2022. https://doi.org/10.1007/978-3-031-19836-6_18 info:eu-repo/semantics/openAccess</mods:accessCondition>
               <mods:subject>
                  <mods:topic>Audio-visual</mods:topic>
               </mods:subject>
               <mods:subject>
                  <mods:topic>Source separation</mods:topic>
               </mods:subject>
               <mods:subject>
                  <mods:topic>Speech</mods:topic>
               </mods:subject>
               <mods:subject>
                  <mods:topic>Singing voice</mods:topic>
               </mods:subject>
               <mods:titleInfo>
                  <mods:title>VoViT: low latency graph-based audio-visual voice separation transformer</mods:title>
               </mods:titleInfo>
               <mods:genre>info:eu-repo/semantics/conferenceObject info:eu-repo/semantics/acceptedVersion</mods:genre>
            </mods:mods>
         </xmlData>
      </mdWrap>
   </dmdSec>
   <structMap LABEL="DSpace Object" TYPE="LOGICAL">
      <div TYPE="DSpace Object Contents" ADMID="DMD_10230_70025"/>
   </structMap>
</mets></metadata></record></GetRecord></OAI-PMH>