<?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl" href="static/style.xsl"?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2026-04-13T02:39:00Z</responseDate><request verb="GetRecord" identifier="oai:www.recercat.cat:2117/123852" metadataPrefix="marc">https://recercat.cat/oai/request</request><GetRecord><record><header><identifier>oai:recercat.cat:2117/123852</identifier><datestamp>2026-01-21T04:13:13Z</datestamp><setSpec>com_2072_1033</setSpec><setSpec>col_2072_452950</setSpec></header><metadata><record xmlns="http://www.loc.gov/MARC21/slim" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:doc="http://www.lyncode.com/xoai" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">
   <leader>00925njm 22002777a 4500</leader>
   <datafield ind2=" " ind1=" " tag="042">
      <subfield code="a">dc</subfield>
   </datafield>
   <datafield ind2=" " ind1=" " tag="720">
      <subfield code="a">Bonafonte Cávez, Antonio</subfield>
      <subfield code="e">author</subfield>
   </datafield>
   <datafield ind2=" " ind1=" " tag="720">
      <subfield code="a">Pascual de la Puente, Santiago</subfield>
      <subfield code="e">author</subfield>
   </datafield>
   <datafield ind2=" " ind1=" " tag="720">
      <subfield code="a">Dorca, G.</subfield>
      <subfield code="e">author</subfield>
   </datafield>
   <datafield ind2=" " ind1=" " tag="260">
      <subfield code="c">2018</subfield>
   </datafield>
   <datafield ind2=" " ind1=" " tag="520">
      <subfield code="a">During the 2000s decade, unit-selection based text-to-speech was the dominant commercial technology. Meanwhile, the TTS research community has made a big effort to push statistical-parametric speech synthesis to get similar quality and more flexibility on the synthetically generated voice. During last years, deep learning advances applied to speech synthesis have filled the gap, specially when neural vocoders substitute traditional signal-processing based vocoders. In this paper we propose to substitute the waveform generation vocoder of MUSA, our Spanish TTS, with SampleRNN, a neural vocoder which was recently proposed as a deep autoregressive raw waveform generation model. MUSA uses recurrent neural networks to predict vocoder parameters (MFCC and logF0) from linguistic features. Then, the Ahocoder vocoder is used to recover the speech waveform out of the predicted parameters. In the first system SampleRNN is extended to generate speech conditioned on the Ahocoder generated parameters (mfcc and logF0), where two configurations have been considered to train the system. First, the parameters derived from the signal using Ahocoder are used. Secondly, the system is trained with the parameters predicted by MUSA, where SampleRNN and MUSA are jointly optimized. The subjective evaluation shows that the second system outperforms both the original Ahocoder and SampleRNN as an independent neural vocoder.</subfield>
   </datafield>
   <datafield ind2=" " ind1=" " tag="520">
      <subfield code="a">Peer Reviewed</subfield>
   </datafield>
   <datafield ind2=" " ind1=" " tag="520">
      <subfield code="a">Postprint (published version)</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Àrees temàtiques de la UPC::Enginyeria de la telecomunicació::Processament del senyal::Processament de la parla i del senyal acústic</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Automatic speech recognition</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Neural vocoder</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">SampleRNN</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Spanish TTS</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">SPSS</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Deep learning</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Linguistics</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Signal processing</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Speech communication</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Speech synthesis</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Vocoders</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Commercial technology</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Research communities</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">SampleRNN</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Spanish TTS</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">SPSS</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Statistical parametric speech synthesis</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Subjective evaluations</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Waveform generation</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Recurrent neural networks</subfield>
   </datafield>
   <datafield tag="653" ind2=" " ind1=" ">
      <subfield code="a">Reconeixement automàtic de la parla</subfield>
   </datafield>
   <datafield ind2="0" ind1="0" tag="245">
      <subfield code="a">Spanish statistical parametric speech synthesis using a neural vocoder</subfield>
   </datafield>
</record></metadata></record></GetRecord></OAI-PMH>