diff --git a/audio/text_to_speech/README.md b/audio/text_to_speech/README.md new file mode 100644 index 00000000..a614aa81 --- /dev/null +++ b/audio/text_to_speech/README.md @@ -0,0 +1,16 @@ +# Text-to-Speech Demo + +This is a demo showing text-to-speech support. + +Language: GDScript + +Renderer: Compatibility + +## How does it work? + +It uses `tts_*()` methods of the [`DisplayServer`](https://docs.godotengine.org/en/latest/classes/class_displayserver.html) singleton +to enumerate voice information, send utterances to the OS TTS API, and receive callback signals. + +## Screenshots + +![Screenshot](screenshots/text_to_speech.webp) diff --git a/audio/text_to_speech/control.tscn b/audio/text_to_speech/control.tscn new file mode 100644 index 00000000..08b1df71 --- /dev/null +++ b/audio/text_to_speech/control.tscn @@ -0,0 +1,244 @@ +[gd_scene load_steps=2 format=3 uid="uid://u5emvyeyodyh"] + +[ext_resource type="Script" path="res://voice_list.gd" id="1_0bwjs"] + +[node name="Control" type="Control"] +layout_mode = 3 +anchors_preset = 8 +anchor_left = 0.5 +anchor_top = 0.5 +anchor_right = 0.5 +anchor_bottom = 0.5 +offset_left = -576.0 +offset_top = -312.0 +offset_right = -576.0 +offset_bottom = -312.0 +grow_horizontal = 2 +grow_vertical = 2 +size_flags_horizontal = 4 +size_flags_vertical = 4 +script = ExtResource("1_0bwjs") + +[node name="LineEditFilterLang" type="LineEdit" parent="."] +layout_mode = 0 +offset_left = 416.0 +offset_top = 304.0 +offset_right = 704.0 +offset_bottom = 337.0 +theme_override_font_sizes/font_size = 16 +placeholder_text = "Language" + +[node name="LineEditFilterName" type="LineEdit" parent="."] +layout_mode = 0 +offset_left = 96.0 +offset_top = 304.0 +offset_right = 408.0 +offset_bottom = 337.0 +theme_override_font_sizes/font_size = 16 +placeholder_text = "Name" + +[node name="Label" type="Label" parent="LineEditFilterName"] +layout_mode = 0 +offset_left = -76.0 +offset_top = 3.0 +offset_right = -20.0 +offset_bottom = 35.0 +text = "Filter:" + 
+[node name="Tree" type="Tree" parent="."] +layout_mode = 0 +offset_left = 16.0 +offset_top = 56.0 +offset_right = 704.0 +offset_bottom = 296.0 +columns = 2 + +[node name="Utterance" type="TextEdit" parent="."] +layout_mode = 0 +offset_left = 264.0 +offset_top = 472.0 +offset_right = 704.0 +offset_bottom = 584.0 +theme_override_font_sizes/font_size = 16 +text = "Beware the Jabberwock, my son! The jaws that bite, the claws that catch!" +wrap_mode = 1 +draw_spaces = true + +[node name="ButtonSpeak" type="Button" parent="."] +layout_mode = 0 +offset_left = 16.0 +offset_top = 472.0 +offset_right = 128.0 +offset_bottom = 504.0 +text = "Speak" + +[node name="ButtonIntSpeak" type="Button" parent="."] +layout_mode = 0 +offset_left = 144.0 +offset_top = 472.0 +offset_right = 256.0 +offset_bottom = 504.0 +text = "Interrupt" + +[node name="ButtonStop" type="Button" parent="."] +layout_mode = 0 +offset_left = 16.0 +offset_top = 512.0 +offset_right = 128.0 +offset_bottom = 544.0 +text = "Stop" + +[node name="ButtonPause" type="Button" parent="."] +layout_mode = 0 +offset_left = 144.0 +offset_top = 512.0 +offset_right = 256.0 +offset_bottom = 544.0 +toggle_mode = true +text = "Pause" + +[node name="HSliderRate" type="HSlider" parent="."] +layout_mode = 0 +offset_left = 96.0 +offset_top = 352.0 +offset_right = 440.0 +offset_bottom = 368.0 +min_value = 0.1 +max_value = 10.0 +step = 0.05 +value = 1.0 +exp_edit = true + +[node name="Label" type="Label" parent="HSliderRate"] +layout_mode = 0 +offset_left = -76.0 +offset_top = -5.0 +offset_right = -20.0 +offset_bottom = 27.0 +text = "Rate:" + +[node name="Value" type="Label" parent="HSliderRate"] +layout_mode = 0 +offset_left = 352.0 +offset_top = -8.0 +offset_right = 416.0 +offset_bottom = 24.0 +text = "1.00x" + +[node name="HSliderPitch" type="HSlider" parent="."] +layout_mode = 0 +offset_left = 96.0 +offset_top = 392.0 +offset_right = 440.0 +offset_bottom = 408.0 +max_value = 2.0 +step = 0.05 +value = 1.0 + +[node name="Label" 
type="Label" parent="HSliderPitch"] +layout_mode = 0 +offset_left = -76.0 +offset_top = -5.0 +offset_right = -28.0 +offset_bottom = 27.0 +text = "Pitch:" + +[node name="Value" type="Label" parent="HSliderPitch"] +layout_mode = 0 +offset_left = 352.0 +offset_top = -8.0 +offset_right = 416.0 +offset_bottom = 24.0 +text = "1.00x" + +[node name="HSliderVolume" type="HSlider" parent="."] +layout_mode = 0 +offset_left = 96.0 +offset_top = 432.0 +offset_right = 440.0 +offset_bottom = 448.0 +min_value = 1.0 +value = 50.0 + +[node name="Label" type="Label" parent="HSliderVolume"] +layout_mode = 0 +offset_left = -76.0 +offset_top = -5.0 +offset_right = -12.0 +offset_bottom = 27.0 +text = "Volume:" + +[node name="Value" type="Label" parent="HSliderVolume"] +layout_mode = 0 +offset_left = 352.0 +offset_top = -8.0 +offset_right = 416.0 +offset_bottom = 24.0 +text = "50%" + +[node name="ColorRect" type="ColorRect" parent="."] +layout_mode = 0 +offset_left = 16.0 +offset_top = 16.0 +offset_right = 144.0 +offset_bottom = 40.0 + +[node name="Label" type="Label" parent="ColorRect"] +layout_mode = 0 +offset_right = 128.0 +offset_bottom = 32.0 +theme_override_font_sizes/font_size = 16 +text = "Speaking..." 
+ +[node name="Log" type="TextEdit" parent="."] +layout_mode = 0 +offset_left = 712.0 +offset_top = 56.0 +offset_right = 1138.0 +offset_bottom = 584.0 +editable = false +context_menu_enabled = false +shortcut_keys_enabled = false +virtual_keyboard_enabled = false +middle_mouse_paste_enabled = false + +[node name="ButtonClearLog" type="Button" parent="Log"] +layout_mode = 0 +offset_left = 346.0 +offset_top = 8.0 +offset_right = 418.0 +offset_bottom = 39.0 +theme_override_font_sizes/font_size = 16 +text = "Clear" + +[node name="RichTextLabel" type="RichTextLabel" parent="."] +layout_mode = 0 +offset_left = 152.0 +offset_top = 16.0 +offset_right = 1008.0 +offset_bottom = 40.0 +theme_override_font_sizes/normal_font_size = 16 +bbcode_enabled = true +scroll_active = false + +[node name="ButtonDemo" type="Button" parent="."] +layout_mode = 0 +offset_left = 16.0 +offset_top = 552.0 +offset_right = 256.0 +offset_bottom = 581.0 +theme_override_font_sizes/font_size = 16 +text = "Demo" + +[connection signal="text_changed" from="LineEditFilterLang" to="." method="_on_LineEditFilterName_text_changed"] +[connection signal="text_changed" from="LineEditFilterName" to="." method="_on_LineEditFilterName_text_changed"] +[connection signal="item_activated" from="Tree" to="." method="_on_ItemList_item_activated"] +[connection signal="pressed" from="ButtonSpeak" to="." method="_on_ButtonSpeak_pressed"] +[connection signal="pressed" from="ButtonIntSpeak" to="." method="_on_ButtonIntSpeak_pressed"] +[connection signal="pressed" from="ButtonStop" to="." method="_on_ButtonStop_pressed"] +[connection signal="pressed" from="ButtonPause" to="." method="_on_ButtonPause_pressed"] +[connection signal="value_changed" from="HSliderRate" to="." method="_on_HSliderRate_value_changed"] +[connection signal="value_changed" from="HSliderPitch" to="." method="_on_HSliderPitch_value_changed"] +[connection signal="value_changed" from="HSliderVolume" to="." 
method="_on_HSliderVolume_value_changed"] +[connection signal="pressed" from="Log/ButtonClearLog" to="." method="_on_ButtonClearLog_pressed"] +[connection signal="pressed" from="ButtonDemo" to="." method="_on_Button_pressed"] diff --git a/audio/text_to_speech/icon.png b/audio/text_to_speech/icon.png new file mode 100644 index 00000000..307ced8e Binary files /dev/null and b/audio/text_to_speech/icon.png differ diff --git a/audio/text_to_speech/icon.png.import b/audio/text_to_speech/icon.png.import new file mode 100644 index 00000000..2d716012 --- /dev/null +++ b/audio/text_to_speech/icon.png.import @@ -0,0 +1,34 @@ +[remap] + +importer="texture" +type="CompressedTexture2D" +uid="uid://53lrswe56fov" +path="res://.godot/imported/icon.png-487276ed1e3a0c39cad0279d744ee560.ctex" +metadata={ +"vram_texture": false +} + +[deps] + +source_file="res://icon.png" +dest_files=["res://.godot/imported/icon.png-487276ed1e3a0c39cad0279d744ee560.ctex"] + +[params] + +compress/mode=0 +compress/high_quality=false +compress/lossy_quality=0.7 +compress/hdr_compression=1 +compress/normal_map=0 +compress/channel_pack=0 +mipmaps/generate=false +mipmaps/limit=-1 +roughness/mode=0 +roughness/src_normal="" +process/fix_alpha_border=true +process/premult_alpha=false +process/normal_map_invert_y=false +process/hdr_as_srgb=false +process/hdr_clamp_exposure=false +process/size_limit=0 +detect_3d/compress_to=1 diff --git a/audio/text_to_speech/project.godot b/audio/text_to_speech/project.godot new file mode 100644 index 00000000..77896226 --- /dev/null +++ b/audio/text_to_speech/project.godot @@ -0,0 +1,28 @@ +; Engine configuration file. +; It's best edited using the editor UI and not directly, +; since the parameters that go here are not all obvious. +; +; Format: +; [section] ; section goes between [] +; param=value ; assign values to parameters + +config_version=5 + +[application] + +config/name="Text-to-speech demo" +config/description="This is a demo showing text-to-speech support." 
+run/main_scene="res://control.tscn" +config/features=PackedStringArray("4.0") +config/icon="res://icon.png" + +[display] + +window/stretch/mode="canvas_items" +window/stretch/aspect="expand" + +[rendering] + +renderer/rendering_method="gl_compatibility" +renderer/rendering_method.mobile="gl_compatibility" +environment/defaults/default_clear_color=Color(0.2, 0.2, 0.2, 1) diff --git a/audio/text_to_speech/screenshots/.gdignore b/audio/text_to_speech/screenshots/.gdignore new file mode 100644 index 00000000..e69de29b diff --git a/audio/text_to_speech/screenshots/text_to_speech.webp b/audio/text_to_speech/screenshots/text_to_speech.webp new file mode 100644 index 00000000..a1295b88 Binary files /dev/null and b/audio/text_to_speech/screenshots/text_to_speech.webp differ diff --git a/audio/text_to_speech/voice_list.gd b/audio/text_to_speech/voice_list.gd new file mode 100644 index 00000000..320fa8f3 --- /dev/null +++ b/audio/text_to_speech/voice_list.gd @@ -0,0 +1,127 @@ +extends Control + +var id = 0 #utterance id +var ut_map = {} +var vs + +func _ready(): + # get voice data + vs = DisplayServer.tts_get_voices() + var root = $Tree.create_item() + $Tree.set_hide_root(true) + $Tree.set_column_title(0, "Name") + $Tree.set_column_title(1, "Language") + $Tree.set_column_titles_visible(true) + for v in vs: + var child = $Tree.create_item(root) + child.set_text(0, v["name"]) + child.set_metadata(0, v["id"]) + child.set_text(1, v["language"]) + $Log.text += "%d voices available\n" % [vs.size()] + $Log.text += "=======\n" + + # add callbacks + DisplayServer.tts_set_utterance_callback(DisplayServer.TTS_UTTERANCE_STARTED, Callable(self, "_on_utterance_start")) + DisplayServer.tts_set_utterance_callback(DisplayServer.TTS_UTTERANCE_ENDED, Callable(self, "_on_utterance_end")) + DisplayServer.tts_set_utterance_callback(DisplayServer.TTS_UTTERANCE_CANCELED, Callable(self, "_on_utterance_error")) + DisplayServer.tts_set_utterance_callback(DisplayServer.TTS_UTTERANCE_BOUNDARY, 
Callable(self, "_on_utterance_boundary"))
	set_process(true)


# Per-frame UI refresh: shows the TTS state reported by DisplayServer.
func _process(delta):
	# The toggle button displays whatever pause state the TTS backend reports.
	$ButtonPause.button_pressed = DisplayServer.tts_is_paused()
	# Indicator rectangle: red while speaking, white otherwise.
	if DisplayServer.tts_is_speaking():
		$ColorRect.color = Color(1, 0, 0)
	else:
		$ColorRect.color = Color(1, 1, 1)


# TTS_UTTERANCE_BOUNDARY callback.
# Highlights the text of utterance `id` up to offset `pos` and shows the
# remainder unhighlighted.
func _on_utterance_boundary(pos, id):
	# An id that is not tracked in ut_map has no text to show; return instead
	# of failing on the dictionary lookup.
	if not ut_map.has(id):
		return
	var spoken_text = ut_map[id]
	$RichTextLabel.text = "[bgcolor=yellow][color=black]" + spoken_text.substr(0, pos) + "[/color][/bgcolor]" + spoken_text.substr(pos, -1)


# TTS_UTTERANCE_STARTED callback: logs the start of utterance `id`.
func _on_utterance_start(id):
	$Log.text += "utterance %d started\n" % [id]


# TTS_UTTERANCE_ENDED callback: shows the whole text highlighted, logs the
# event and stops tracking utterance `id`.
func _on_utterance_end(id):
	# Only touch the label when the text is still tracked; the log entry and
	# the cleanup below happen either way.
	if ut_map.has(id):
		$RichTextLabel.text = "[bgcolor=yellow][color=black]" + ut_map[id] + "[/color][/bgcolor]"
	$Log.text += "utterance %d ended\n" % [id]
	ut_map.erase(id)


# TTS_UTTERANCE_CANCELED callback: clears the highlight label, logs the event
# and stops tracking utterance `id`.
func _on_utterance_error(id):
	$RichTextLabel.text = ""
	$Log.text += "utterance %d canceled/failed\n" % [id]
	ut_map.erase(id)


# "Stop" button handler.
func _on_ButtonStop_pressed():
	DisplayServer.tts_stop()


# "Pause" toggle button handler: pauses when the button is toggled on and
# resumes when it is toggled off.
func _on_ButtonPause_pressed():
	# `button_pressed` is the toggle state. `pressed` is the signal of the same
	# button and is always truthy, so testing it could never reach tts_resume().
	if $ButtonPause.button_pressed:
		DisplayServer.tts_pause()
	else:
		DisplayServer.tts_resume()


# "Speak" button handler: speaks the text box content with the selected voice
# without interrupting.
func _on_ButtonSpeak_pressed():
	_speak_with_selected_voice(false, "queried", "Speak")


# "Interrupt" button handler: same as Speak, but passes interrupt = true.
func _on_ButtonIntSpeak_pressed():
	_speak_with_selected_voice(true, "interrupt", "Interrupt")


# Shared implementation of the Speak and Interrupt buttons.
#   interrupt    - value passed as the last argument of tts_speak().
#   log_word     - word written to the log after the utterance id.
#   button_label - button name mentioned in the "No voice selected" alert.
func _speak_with_selected_voice(interrupt, log_word, button_label):
	var selected_item = $Tree.get_selected()
	if not selected_item:
		OS.alert("No voice selected.\nSelect a voice in the list, then try using %s again." % [button_label])
		return
	$Log.text += "utterance %d %s\n" % [id, log_word]
	# Remember the text so the utterance callbacks can display it by id.
	ut_map[id] = $Utterance.text
	# Column 0 metadata of each tree item holds the voice id.
	DisplayServer.tts_speak($Utterance.text, selected_item.get_metadata(0), $HSliderVolume.value, $HSliderPitch.value, $HSliderRate.value, id, interrupt)
	id += 1


# "Clear" button handler: empties the log box.
func _on_ButtonClearLog_pressed():
	$Log.text = ""


# Rate slider handler: shows the value with two decimals, e.g. "1.00x".
func _on_HSliderRate_value_changed(value):
	$HSliderRate/Value.text = "%.2fx" % [value]

# Pitch slider handler: shows the value with two decimals, e.g. "1.00x".
func _on_HSliderPitch_value_changed(value):
	$HSliderPitch/Value.text = "%.2fx" % [value]


# Volume slider handler: shows the value as a whole-number percentage.
func _on_HSliderVolume_value_changed(value):
	$HSliderVolume/Value.text = "%d%%" % [value]


# "Demo" button handler.
# For each demo language that has at least one voice, speaks two lines with
# the first voice returned, using fixed volume 50, pitch 1 and rate 1.
func _on_Button_pressed():
	# One row per language: [language code, first line, second line].
	var demo_lines = [
		["en", "Beware the Jabberwock, my son!", "The jaws that bite, the claws that catch!"],
		["es", "¡Cuidado, hijo, con el Fablistanón!", "¡Con sus dientes y garras, muerde, apresa!"],
		["ru", "О, бойся Бармаглота, сын!", "Он так свирлеп и дик!"],
	]
	for entry in demo_lines:
		var vc = DisplayServer.tts_get_voices_for_language(entry[0])
		# Languages without any voice are skipped and consume no ids.
		if vc.is_empty():
			continue
		# Register both texts before speaking so the utterance callbacks can
		# look them up by id.
		ut_map[id] = entry[1]
		ut_map[id + 1] = entry[2]
		DisplayServer.tts_speak(entry[1], vc[0], 50, 1, 1, id)
		DisplayServer.tts_speak(entry[2], vc[0], 50, 1, 1, id + 1)
		id += 2


# Handler for the Tree's `item_activated` signal. control.tscn connects that
# signal to this method name, so it has to exist in the script. Activating a
# voice behaves like pressing "Speak".
func _on_ItemList_item_activated():
	_on_ButtonSpeak_pressed()


# Shared `text_changed` handler of both filter boxes.
# Rebuilds the voice tree with the voices whose name and language contain the
# respective filter text, compared case-insensitively. An empty filter matches
# every voice. `new_text` is not used; both boxes are read directly.
func _on_LineEditFilterName_text_changed(new_text):
	# The lowered filters do not change per voice, so compute them once.
	var name_filter = $LineEditFilterName.text.to_lower()
	var lang_filter = $LineEditFilterLang.text.to_lower()
	$Tree.clear()
	var root = $Tree.create_item()
	for v in vs:
		if not name_filter.is_empty() and not (name_filter in v["name"].to_lower()):
			continue
		if not lang_filter.is_empty() and not (lang_filter in v["language"].to_lower()):
			continue
		var child = $Tree.create_item(root)
		child.set_text(0, v["name"])
		# The voice id is stored as metadata for later use with tts_speak().
		child.set_metadata(0, v["id"])
		child.set_text(1, v["language"])