<?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl" href="static/style.xsl"?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2026-04-17T05:12:34Z</responseDate><request verb="GetRecord" identifier="oai:www.recercat.cat:10230/70025" metadataPrefix="marc">https://recercat.cat/oai/request</request><GetRecord><record><header><identifier>oai:recercat.cat:10230/70025</identifier><datestamp>2025-12-24T08:36:23Z</datestamp><setSpec>com_2072_6</setSpec><setSpec>col_2072_452952</setSpec></header><metadata><record xmlns="http://www.loc.gov/MARC21/slim" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:doc="http://www.lyncode.com/xoai" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">
   <leader>00925njm 22002777a 4500</leader>
   <datafield ind2=" " ind1=" " tag="042">
      <subfield code="a">dc</subfield>
   </datafield>
   <datafield ind2=" " ind1=" " tag="720">
      <subfield code="a">Montesinos García, Juan Felipe</subfield>
      <subfield code="e">author</subfield>
   </datafield>
   <datafield ind2=" " ind1=" " tag="720">
      <subfield code="a">Kadandale, Venkatesh S.</subfield>
      <subfield code="e">author</subfield>
   </datafield>
   <datafield ind2=" " ind1=" " tag="720">
      <subfield code="a">Haro Ortega, Gloria</subfield>
      <subfield code="e">author</subfield>
   </datafield>
   <datafield ind2=" " ind1=" " tag="260">
      <subfield code="c">2025-03-27T07:24:43Z</subfield>
   </datafield>
   <datafield ind2=" " ind1=" " tag="260">
      <subfield code="c">2025-03-27T07:24:43Z</subfield>
   </datafield>
   <datafield ind2=" " ind1=" " tag="260">
      <subfield code="c">2022</subfield>
   </datafield>
   <datafield ind2=" " ind1=" " tag="520">
      <subfield code="a">This paper presents an audio-visual approach for voice separation which produces state-of-the-art results at a low latency in two scenarios: speech and singing voice. The model is based on a two-stage network. Motion cues are obtained with a lightweight graph convolutional network that processes face landmarks. Then, both audio and motion features are fed to an audio-visual transformer which produces a fairly good estimation of the isolated target source. In a second stage, the predominant voice is enhanced with an audio-only network. We present different ablation studies and comparison to state-of-the-art methods.
Finally, we explore the transferability of models trained for speech separation in the task of singing voice separation. The demos, code, and weights are available in https://ipcv.github.io/VoViT/.</subfield>
   </datafield>
   <datafield ind2=" " ind1=" " tag="520">
      <subfield code="a">We acknowledge support by MICINN/FEDER UE project PID2021-127643NB-I00; H2020-MSCA-RISE-2017 project 777826 NoMADS. J.F.M. acknowledges support by FPI scholarship PRE2018-083920. We acknowledge NVIDIA Corporation for the donation of GPUs used for the experiments.</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Audio-visual</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Source separation</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Speech</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Singing voice</subfield>
   </datafield>
   <datafield ind2="0" ind1="0" tag="245">
      <subfield code="a">VoViT: low latency graph-based audio-visual voice separation transformer</subfield>
   </datafield>
</record></metadata></record></GetRecord></OAI-PMH>