@inproceedings{kosgi-etal-2022-empathic,
title = "Empathic Machines: Using Intermediate Features as Levers to Emulate Emotions in Text-To-Speech Systems",
author = "Kosgi, Saiteja and
Sivaprasad, Sarath and
Pedanekar, Niranjan and
Nelakanti, Anil and
Gandhi, Vineet",
editor = "Carpuat, Marine and
de Marneffe, Marie-Catherine and
Meza Ruiz, Ivan Vladimir",
booktitle = "Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies",
month = jul,
year = "2022",
address = "Seattle, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.naacl-main.26",
doi = "10.18653/v1/2022.naacl-main.26",
pages = "336--347",
abstract = "We present a method to control the emotional prosody of Text to Speech (TTS) systems by using phoneme-level intermediate features (pitch, energy, and duration) as levers. As a key idea, we propose Differential Scaling (DS) to disentangle features relating to affective prosody from those arising due to acoustics conditions and speaker identity. With thorough experimental studies, we show that the proposed method improves over the prior art in accurately emulating the desired emotions while retaining the naturalness of speech. We extend the traditional evaluation of using individual sentences for a more complete evaluation of HCI systems. We present a novel experimental setup by replacing an actor with a TTS system in offline and live conversations. The emotion to be rendered is either predicted or manually assigned. The results show that the proposed method is strongly preferred over the state-of-the-art TTS system and adds the much-coveted {``}human touch{''} in machine dialogue. Audio samples from our experiments and the code are available at: \url{https://emtts.github.io/tts-demo/}",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kosgi-etal-2022-empathic">
<titleInfo>
<title>Empathic Machines: Using Intermediate Features as Levers to Emulate Emotions in Text-To-Speech Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saiteja</namePart>
<namePart type="family">Kosgi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sarath</namePart>
<namePart type="family">Sivaprasad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Niranjan</namePart>
<namePart type="family">Pedanekar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anil</namePart>
<namePart type="family">Nelakanti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vineet</namePart>
<namePart type="family">Gandhi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marine</namePart>
<namePart type="family">Carpuat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie-Catherine</namePart>
<namePart type="family">de Marneffe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="given">Vladimir</namePart>
<namePart type="family">Meza Ruiz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Seattle, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present a method to control the emotional prosody of Text to Speech (TTS) systems by using phoneme-level intermediate features (pitch, energy, and duration) as levers. As a key idea, we propose Differential Scaling (DS) to disentangle features relating to affective prosody from those arising due to acoustics conditions and speaker identity. With thorough experimental studies, we show that the proposed method improves over the prior art in accurately emulating the desired emotions while retaining the naturalness of speech. We extend the traditional evaluation of using individual sentences for a more complete evaluation of HCI systems. We present a novel experimental setup by replacing an actor with a TTS system in offline and live conversations. The emotion to be rendered is either predicted or manually assigned. The results show that the proposed method is strongly preferred over the state-of-the-art TTS system and adds the much-coveted “human touch” in machine dialogue. Audio samples from our experiments and the code are available at: https://emtts.github.io/tts-demo/</abstract>
<identifier type="citekey">kosgi-etal-2022-empathic</identifier>
<identifier type="doi">10.18653/v1/2022.naacl-main.26</identifier>
<location>
<url>https://aclanthology.org/2022.naacl-main.26</url>
</location>
<part>
<date>2022-07</date>
<extent unit="page">
<start>336</start>
<end>347</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Empathic Machines: Using Intermediate Features as Levers to Emulate Emotions in Text-To-Speech Systems
%A Kosgi, Saiteja
%A Sivaprasad, Sarath
%A Pedanekar, Niranjan
%A Nelakanti, Anil
%A Gandhi, Vineet
%Y Carpuat, Marine
%Y de Marneffe, Marie-Catherine
%Y Meza Ruiz, Ivan Vladimir
%S Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies
%D 2022
%8 July
%I Association for Computational Linguistics
%C Seattle, United States
%F kosgi-etal-2022-empathic
%X We present a method to control the emotional prosody of Text to Speech (TTS) systems by using phoneme-level intermediate features (pitch, energy, and duration) as levers. As a key idea, we propose Differential Scaling (DS) to disentangle features relating to affective prosody from those arising due to acoustics conditions and speaker identity. With thorough experimental studies, we show that the proposed method improves over the prior art in accurately emulating the desired emotions while retaining the naturalness of speech. We extend the traditional evaluation of using individual sentences for a more complete evaluation of HCI systems. We present a novel experimental setup by replacing an actor with a TTS system in offline and live conversations. The emotion to be rendered is either predicted or manually assigned. The results show that the proposed method is strongly preferred over the state-of-the-art TTS system and adds the much-coveted “human touch” in machine dialogue. Audio samples from our experiments and the code are available at: https://emtts.github.io/tts-demo/
%R 10.18653/v1/2022.naacl-main.26
%U https://aclanthology.org/2022.naacl-main.26
%U https://doi.org/10.18653/v1/2022.naacl-main.26
%P 336-347
Markdown (Informal)
[Empathic Machines: Using Intermediate Features as Levers to Emulate Emotions in Text-To-Speech Systems](https://aclanthology.org/2022.naacl-main.26) (Kosgi et al., NAACL 2022)
ACL