<?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl" href="static/style.xsl"?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2026-04-18T00:21:13Z</responseDate><request verb="GetRecord" identifier="oai:www.recercat.cat:10230/70025" metadataPrefix="didl">https://recercat.cat/oai/request</request><GetRecord><record><header><identifier>oai:recercat.cat:10230/70025</identifier><datestamp>2025-12-24T08:36:23Z</datestamp><setSpec>com_2072_6</setSpec><setSpec>col_2072_452952</setSpec></header><metadata><d:DIDL xmlns:d="urn:mpeg:mpeg21:2002:02-DIDL-NS" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:doc="http://www.lyncode.com/xoai" xsi:schemaLocation="urn:mpeg:mpeg21:2002:02-DIDL-NS http://standards.iso.org/ittf/PubliclyAvailableStandards/MPEG-21_schema_files/did/didl.xsd">
   <d:Item id="hdl_10230_70025">
      <d:Descriptor>
         <d:Statement mimeType="application/xml; charset=utf-8">
            <dii:Identifier xmlns:dii="urn:mpeg:mpeg21:2002:01-DII-NS" xsi:schemaLocation="urn:mpeg:mpeg21:2002:01-DII-NS http://standards.iso.org/ittf/PubliclyAvailableStandards/MPEG-21_schema_files/dii/dii.xsd">urn:hdl:10230/70025</dii:Identifier>
         </d:Statement>
      </d:Descriptor>
      <d:Descriptor>
         <d:Statement mimeType="application/xml; charset=utf-8">
            <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
               <dc:title>VoViT: low latency graph-based audio-visual voice separation transformer</dc:title>
               <dc:creator>Montesinos García, Juan Felipe</dc:creator>
               <dc:creator>Kadandale, Venkatesh S.</dc:creator>
               <dc:creator>Haro Ortega, Gloria</dc:creator>
               <dc:subject>Audio-visual</dc:subject>
               <dc:subject>Source separation</dc:subject>
               <dc:subject>Speech</dc:subject>
               <dc:subject>Singing voice</dc:subject>
               <dc:description>This paper presents an audio-visual approach for voice separation which produces state-of-the-art results at a low latency in two scenarios: speech and singing voice. The model is based on a two-stage network. Motion cues are obtained with a lightweight graph convolutional network that processes face landmarks. Then, both audio and motion features are fed to an audio-visual transformer which produces a fairly good estimation of the isolated target source. In a second stage, the predominant voice is enhanced with an audio-only network. We present different ablation studies and comparison to state-of-the-art methods. Finally, we explore the transferability of models trained for speech separation in the task of singing voice separation. The demos, code, and weights are available at https://ipcv.github.io/VoViT/.</dc:description>
               <dc:description>We acknowledge support by MICINN/FEDER UE project PID2021-127643NB-I00; H2020-MSCA-RISE-2017 project 777826 NoMADS. J.F.M. acknowledges support by FPI scholarship PRE2018-083920. We acknowledge NVIDIA Corporation for the donation of GPUs used for the experiments.</dc:description>
               <dc:date>2025-03-27T07:24:43Z</dc:date>
               <dc:date>2025-03-27T07:24:43Z</dc:date>
               <dc:date>2022</dc:date>
               <dc:type>info:eu-repo/semantics/conferenceObject</dc:type>
               <dc:type>info:eu-repo/semantics/acceptedVersion</dc:type>
               <dc:relation>Avidan S, Brostow G, Cissé M, Maria Farinella G, Hassner T, editors. 17th European Conference on Computer Vision Part XXXVII (ECCV 2022); 2022 October 23-7; Tel Aviv, Israel. Cham: Springer Verlag; 2022. p.310-26. (LNCS; no. 13697). DOI: 10.1007/978-3-031-19836-6_18</dc:relation>
               <dc:relation>info:eu-repo/grantAgreement/ES/3PE/PID2021-127643</dc:relation>
               <dc:relation>info:eu-repo/grantAgreement/EC/H2020/777826</dc:relation>
               <dc:rights>© The Author(s), under exclusive license to Springer Nature Switzerland AG 2022 S. Avidan et al. (Eds.): ECCV 2022, LNCS 13697, pp. 310–326, 2022. https://doi.org/10.1007/978-3-031-19836-6_18</dc:rights>
               <dc:rights>info:eu-repo/semantics/openAccess</dc:rights>
               <dc:publisher>Springer</dc:publisher>
            </oai_dc:dc>
         </d:Statement>
      </d:Descriptor>
   </d:Item>
</d:DIDL></metadata></record></GetRecord></OAI-PMH>