Add push to talk voice assistant (#4648)

* Add push to talk voice assistant

* Refactor most code into voice_assistant

* Make voice_assistant the component and remove push_to_talk (can be done in yaml)

* Fix component setup

* Always AF_INET to match serverside

* Fix microphone and media player co-existence

* Format

* Update codeowners

* Update test file

* Fix endifs

* nullptr not NULL

* clang-tidy

* Format

* fixup: Add VA event data

* Generate proto

* Parse and log events

* Add default to switch

* Fix

* Add mic/va to test5
This commit is contained in:
Jesse Hills
2023-04-12 11:45:10 +12:00
committed by GitHub
parent 80bc567c31
commit b60c08dd28
35 changed files with 1384 additions and 75 deletions
@@ -0,0 +1,57 @@
import esphome.config_validation as cv
import esphome.codegen as cg
from esphome.const import CONF_ID, CONF_MICROPHONE
from esphome import automation
from esphome.automation import register_action
from esphome.components import microphone
AUTO_LOAD = ["socket"]
DEPENDENCIES = ["api", "microphone"]
CODEOWNERS = ["@jesserockz"]
voice_assistant_ns = cg.esphome_ns.namespace("voice_assistant")
VoiceAssistant = voice_assistant_ns.class_("VoiceAssistant", cg.Component)
StartAction = voice_assistant_ns.class_(
"StartAction", automation.Action, cg.Parented.template(VoiceAssistant)
)
StopAction = voice_assistant_ns.class_(
"StopAction", automation.Action, cg.Parented.template(VoiceAssistant)
)
CONFIG_SCHEMA = cv.Schema(
{
cv.GenerateID(): cv.declare_id(VoiceAssistant),
cv.GenerateID(CONF_MICROPHONE): cv.use_id(microphone.Microphone),
}
).extend(cv.COMPONENT_SCHEMA)
async def to_code(config):
var = cg.new_Pvariable(config[CONF_ID])
await cg.register_component(var, config)
mic = await cg.get_variable(config[CONF_MICROPHONE])
cg.add(var.set_microphone(mic))
cg.add_define("USE_VOICE_ASSISTANT")
VOICE_ASSISTANT_ACTION_SCHEMA = cv.Schema({cv.GenerateID(): cv.use_id(VoiceAssistant)})
@register_action("voice_assistant.start", StartAction, VOICE_ASSISTANT_ACTION_SCHEMA)
async def voice_assistant_listen_to_code(config, action_id, template_arg, args):
var = cg.new_Pvariable(action_id, template_arg)
await cg.register_parented(var, config[CONF_ID])
return var
@register_action("voice_assistant.stop", StopAction, VOICE_ASSISTANT_ACTION_SCHEMA)
async def voice_assistant_stop_to_code(config, action_id, template_arg, args):
var = cg.new_Pvariable(action_id, template_arg)
await cg.register_parented(var, config[CONF_ID])
return var
@@ -0,0 +1,148 @@
#include "voice_assistant.h"
#include "esphome/core/log.h"
namespace esphome {
namespace voice_assistant {
static const char *const TAG = "voice_assistant";
float VoiceAssistant::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; }
void VoiceAssistant::setup() {
ESP_LOGCONFIG(TAG, "Setting up Voice Assistant...");
global_voice_assistant = this;
this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
if (socket_ == nullptr) {
ESP_LOGW(TAG, "Could not create socket.");
this->mark_failed();
return;
}
int enable = 1;
int err = socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
if (err != 0) {
ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", err);
// we can still continue
}
err = socket_->setblocking(false);
if (err != 0) {
ESP_LOGW(TAG, "Socket unable to set nonblocking mode: errno %d", err);
this->mark_failed();
return;
}
this->mic_->add_data_callback([this](const std::vector<uint8_t> &data) {
if (!this->running_) {
return;
}
this->socket_->sendto(data.data(), data.size(), 0, (struct sockaddr *) &this->dest_addr_, sizeof(this->dest_addr_));
});
}
void VoiceAssistant::start(struct sockaddr_storage *addr, uint16_t port) {
ESP_LOGD(TAG, "Starting...");
memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_));
if (this->dest_addr_.ss_family == AF_INET) {
((struct sockaddr_in *) &this->dest_addr_)->sin_port = htons(port);
}
#if LWIP_IPV6
else if (this->dest_addr_.ss_family == AF_INET6) {
((struct sockaddr_in6 *) &this->dest_addr_)->sin6_port = htons(port);
}
#endif
else {
ESP_LOGW(TAG, "Unknown address family: %d", this->dest_addr_.ss_family);
return;
}
this->running_ = true;
this->mic_->start();
}
void VoiceAssistant::request_start() {
ESP_LOGD(TAG, "Requesting start...");
api::global_api_server->start_voice_assistant();
}
void VoiceAssistant::signal_stop() {
ESP_LOGD(TAG, "Signaling stop...");
this->mic_->stop();
this->running_ = false;
api::global_api_server->stop_voice_assistant();
memset(&this->dest_addr_, 0, sizeof(this->dest_addr_));
}
void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
switch (msg.event_type) {
case api::enums::VOICE_ASSISTANT_RUN_END:
ESP_LOGD(TAG, "Voice Assistant ended.");
break;
case api::enums::VOICE_ASSISTANT_STT_END: {
std::string text;
for (auto arg : msg.data) {
if (arg.name == "text") {
text = std::move(arg.value);
}
}
if (text.empty()) {
ESP_LOGW(TAG, "No text in STT_END event.");
return;
}
ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str());
// TODO `on_stt_end` trigger
break;
}
case api::enums::VOICE_ASSISTANT_TTS_START: {
std::string text;
for (auto arg : msg.data) {
if (arg.name == "text") {
text = std::move(arg.value);
}
}
if (text.empty()) {
ESP_LOGW(TAG, "No text in TTS_START event.");
return;
}
ESP_LOGD(TAG, "Response: \"%s\"", text.c_str());
// TODO `on_tts_start` trigger
break;
}
case api::enums::VOICE_ASSISTANT_TTS_END: {
std::string url;
for (auto arg : msg.data) {
if (arg.name == "url") {
url = std::move(arg.value);
}
}
if (url.empty()) {
ESP_LOGW(TAG, "No url in TTS_END event.");
return;
}
ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str());
// TODO `on_tts_end` trigger
break;
}
case api::enums::VOICE_ASSISTANT_ERROR: {
std::string code = "";
std::string message = "";
for (auto arg : msg.data) {
if (arg.name == "code") {
code = std::move(arg.value);
} else if (arg.name == "message") {
message = std::move(arg.value);
}
}
ESP_LOGE(TAG, "Error: %s - %s", code.c_str(), message.c_str());
// TODO `on_error` trigger
}
default:
break;
}
}
VoiceAssistant *global_voice_assistant = nullptr; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
} // namespace voice_assistant
} // namespace esphome
@@ -0,0 +1,50 @@
#pragma once
#include "esphome/core/automation.h"
#include "esphome/core/component.h"
#include "esphome/core/helpers.h"
#include "esphome/components/api/api_pb2.h"
#include "esphome/components/api/api_server.h"
#include "esphome/components/microphone/microphone.h"
#include "esphome/components/socket/socket.h"
namespace esphome {
namespace voice_assistant {
class VoiceAssistant : public Component {
public:
void setup() override;
float get_setup_priority() const override;
void start(struct sockaddr_storage *addr, uint16_t port);
void set_microphone(microphone::Microphone *mic) { this->mic_ = mic; }
void request_start();
void signal_stop();
void on_event(const api::VoiceAssistantEventResponse &msg);
protected:
std::unique_ptr<socket::Socket> socket_ = nullptr;
struct sockaddr_storage dest_addr_;
microphone::Microphone *mic_{nullptr};
bool running_{false};
};
template<typename... Ts> class StartAction : public Action<Ts...>, public Parented<VoiceAssistant> {
public:
void play(Ts... x) override { this->parent_->request_start(); }
};
template<typename... Ts> class StopAction : public Action<Ts...>, public Parented<VoiceAssistant> {
public:
void play(Ts... x) override { this->parent_->signal_stop(); }
};
extern VoiceAssistant *global_voice_assistant; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
} // namespace voice_assistant
} // namespace esphome