From 2ef31a6346ad02252786d25a003b04c8c393f720 Mon Sep 17 00:00:00 2001
From: Crysple <871886504@qq.com>
Date: Wed, 12 Aug 2020 10:09:44 -0400
Subject: [PATCH] Audio: add direction transformation code, refactor & fix bugs

---
 modules/audio/audio_component.cc              |  20 +--
 modules/audio/audio_component.h               |   3 +
 modules/audio/conf/audio_conf.pb.txt          |   1 +
 modules/audio/conf/respeaker_extrinsics.yaml  |  17 +++
 modules/audio/inference/BUILD                 |   3 +
 .../audio/inference/direction_detection.cc    | 129 +++++++++++++++---
 modules/audio/inference/direction_detection.h |  51 ++++++-
 modules/audio/proto/BUILD                     |   4 +
 modules/audio/proto/audio.proto               |   3 +
 modules/audio/proto/audio_conf.proto          |   1 +
 modules/dreamview/conf/vehicle_data.pb.txt    |   4 +
 11 files changed, 203 insertions(+), 33 deletions(-)
 create mode 100644 modules/audio/conf/respeaker_extrinsics.yaml
diff --git a/modules/audio/audio_component.cc b/modules/audio/audio_component.cc
index 6336fd5c26..f19864ab30 100644
--- a/modules/audio/audio_component.cc
+++ b/modules/audio/audio_component.cc
@@ -15,12 +15,13 @@
  *****************************************************************************/
 
 #include "modules/audio/audio_component.h"
-#include "modules/audio/inference/direction_detection.h"
 #include "modules/audio/proto/audio_conf.pb.h"
+#include "modules/common/proto/geometry.pb.h"
 
 namespace apollo {
 namespace audio {
 
+using apollo::common::Point3D;
 using apollo::drivers::microphone::config::AudioData;
 
 AudioComponent::~AudioComponent() {}
@@ -42,21 +43,24 @@ bool AudioComponent::Init() {
           audio_conf.topic_conf().localization_topic_name(), nullptr);
   audio_writer_ = node_->CreateWriter<AudioDetection>(
       audio_conf.topic_conf().audio_detection_topic_name());
+  respeaker_extrinsics_file_ = audio_conf.respeaker_extrinsics_path();
   return true;
 }
 
 bool AudioComponent::Proc(const std::shared_ptr<AudioData>& audio_data) {
-  audio_info_.Insert(audio_data);
-  AINFO << "Current direction is: "
-        << get_direction(
-               audio_info_.GetSignals(audio_data->microphone_config().chunk()),
-               audio_data->microphone_config().sample_rate(),
-               audio_data->microphone_config().mic_distance());
   // TODO(all) remove GetSignals() multiple calls
+  audio_info_.Insert(audio_data);
+  AudioDetection audio_detection;
+  *audio_detection.mutable_position() =
+      direction_detection_.EstimateSoundSource(
+          audio_info_.GetSignals(audio_data->microphone_config().chunk()),
+          respeaker_extrinsics_file_,
+          audio_data->microphone_config().sample_rate(),
+          audio_data->microphone_config().mic_distance());
+
   auto signals =
       audio_info_.GetSignals(audio_data->microphone_config().chunk());
   MovingResult moving_result = moving_detection_.Detect(signals);
-  AudioDetection audio_detection;
   audio_detection.set_moving_result(moving_result);
   // TODO(all) add header to audio_detection
   audio_writer_->Write(audio_detection);
diff --git a/modules/audio/audio_component.h b/modules/audio/audio_component.h
index 29ed836cdb..2315bdb4f9 100644
--- a/modules/audio/audio_component.h
+++ b/modules/audio/audio_component.h
@@ -25,6 +25,7 @@
 
 #include "cyber/component/component.h"
 #include "modules/audio/common/audio_info.h"
+#include "modules/audio/inference/direction_detection.h"
 #include "modules/audio/inference/moving_detection.h"
 #include "modules/audio/proto/audio.pb.h"
 #include "modules/drivers/microphone/proto/audio.pb.h"
@@ -59,6 +60,8 @@ class AudioComponent
   AudioInfo audio_info_;
 
   MovingDetection moving_detection_;
+  DirectionDetection direction_detection_;
+  std::string respeaker_extrinsics_file_;
 };
 
 CYBER_REGISTER_COMPONENT(AudioComponent)
diff --git a/modules/audio/conf/audio_conf.pb.txt b/modules/audio/conf/audio_conf.pb.txt
index f2070c1749..2c5eec9453 100644
--- a/modules/audio/conf/audio_conf.pb.txt
+++ b/modules/audio/conf/audio_conf.pb.txt
@@ -3,3 +3,4 @@ topic_conf {
   audio_detection_topic_name: "/apollo/audio_detection"
   localization_topic_name: "/apollo/localization/pose"
 }
+respeaker_extrinsics_path: "/apollo/modules/audio/conf/respeaker_extrinsics.yaml"
\ No newline at end of file
diff --git a/modules/audio/conf/respeaker_extrinsics.yaml b/modules/audio/conf/respeaker_extrinsics.yaml
new file mode 100644
index 0000000000..da8dc961f8
--- /dev/null
+++ b/modules/audio/conf/respeaker_extrinsics.yaml
@@ -0,0 +1,17 @@
+header:
+  stamp:
+    secs: 0
+    nsecs: 0
+  seq: 0
+  frame_id: novatel
+child_frame_id:  microphone
+transform:
+  rotation:
+    x: 0
+    y: 0
+    z: 0
+    w: 1
+  translation:
+    x: 0.0
+    y: 0.68
+    z: 0.72
diff --git a/modules/audio/inference/BUILD b/modules/audio/inference/BUILD
index a8a2efc210..17b7d144fd 100644
--- a/modules/audio/inference/BUILD
+++ b/modules/audio/inference/BUILD
@@ -19,7 +19,10 @@ cc_library(
     hdrs = ["direction_detection.h"],
     deps = [
         "//cyber",
+        "//modules/common/proto:geometry_cc_proto",
         "//third_party:libtorch",
+        "@com_github_jbeder_yaml_cpp//:yaml-cpp",
+        "@eigen",
     ],
 )
 
diff --git a/modules/audio/inference/direction_detection.cc b/modules/audio/inference/direction_detection.cc
index 3a1020153d..e136e325ba 100644
--- a/modules/audio/inference/direction_detection.cc
+++ b/modules/audio/inference/direction_detection.cc
@@ -15,6 +15,7 @@
  *****************************************************************************/
 
 #include "modules/audio/inference/direction_detection.h"
+#include "yaml-cpp/yaml.h"
 
 namespace apollo {
 namespace audio {
@@ -22,10 +23,37 @@ namespace audio {
 using torch::indexing::None;
 using torch::indexing::Slice;
 
-int get_direction(std::vector<std::vector<double>>&& channels_vec,
-                  const int sample_rate, const int mic_distance) {
+DirectionDetection::DirectionDetection() {}
+
+DirectionDetection::~DirectionDetection() {}
+
+Point3D DirectionDetection::EstimateSoundSource(
+    std::vector<std::vector<double>>&& channels_vec,
+    const std::string& respeaker_extrinsic_file, const int sample_rate,
+    const double mic_distance) {
+  if (!respeaker2imu_ptr_.get()) {
+    respeaker2imu_ptr_.reset(new Eigen::Matrix4d);
+    LoadExtrinsics(respeaker_extrinsic_file, respeaker2imu_ptr_.get());
+  }
+  const double degree =
+      EstimateDirection(move(channels_vec), sample_rate, mic_distance);
+  Eigen::Vector4d source_position(kDistance * sin(degree),
+                                  kDistance * cos(degree), 0, 1);
+  source_position = (*respeaker2imu_ptr_) * source_position;
+
+  Point3D source_position_p3d;
+  source_position_p3d.set_x(source_position[0]);
+  source_position_p3d.set_y(source_position[1]);
+  source_position_p3d.set_z(source_position[2]);
+
+  return source_position_p3d;
+}
+
+double DirectionDetection::EstimateDirection(
+    std::vector<std::vector<double>>&& channels_vec, const int sample_rate,
+    const double mic_distance) {
   std::vector<torch::Tensor> channels_ts;
-  auto options = torch::TensorOptions().dtype(torch::kFloat32);
+  auto options = torch::TensorOptions().dtype(torch::kFloat64);
   int size = static_cast<int>(channels_vec[0].size());
   for (auto& signal : channels_vec) {
     channels_ts.push_back(torch::from_blob(signal.data(), {size}, options));
@@ -33,10 +61,10 @@ int get_direction(std::vector<std::vector<double>>&& channels_vec,
 
   double tau0, tau1;
   int theta0, theta1;
-  const double max_tau = mic_distance / SOUND_SPEED;
-  tau0 = gcc_phat(channels_ts[0], channels_ts[2], sample_rate, max_tau, 1);
+  const double max_tau = mic_distance / kSoundSpeed;
+  tau0 = GccPhat(channels_ts[0], channels_ts[2], sample_rate, max_tau, 1);
   theta0 = asin(tau0 / max_tau) * 180 / M_PI;
-  tau1 = gcc_phat(channels_ts[1], channels_ts[3], sample_rate, max_tau, 1);
+  tau1 = GccPhat(channels_ts[1], channels_ts[3], sample_rate, max_tau, 1);
   theta1 = asin(tau1 / max_tau) * 180 / M_PI;
 
   int best_guess = 0;
@@ -48,32 +76,75 @@ int get_direction(std::vector<std::vector<double>>&& channels_vec,
   }
   best_guess = (-best_guess + 120) % 360;
 
-  return best_guess;
+  return static_cast<double>(best_guess) / 90 * M_PI;
 }
 
-/*
- * This function computes the offset between the signal sig and the reference
- * signal refsig using the Generalized Cross Correlation - Phase Transform
- * (GCC-PHAT)method.
- */
-double gcc_phat(const torch::Tensor& sig, const torch::Tensor& refsig, int fs,
-               double max_tau, int interp) {
+bool DirectionDetection::LoadExtrinsics(const std::string& yaml_file,
+                                        Eigen::Matrix4d* respeaker_extrinsic) {
+  if (!apollo::cyber::common::PathExists(yaml_file)) {
+    AINFO << yaml_file << " does not exist!";
+    return false;
+  }
+  YAML::Node node = YAML::LoadFile(yaml_file);
+  double qw = 0.0;
+  double qx = 0.0;
+  double qy = 0.0;
+  double qz = 0.0;
+  double tx = 0.0;
+  double ty = 0.0;
+  double tz = 0.0;
+  try {
+    if (node.IsNull()) {
+      AINFO << "Load " << yaml_file << " failed! please check!";
+      return false;
+    }
+    qw = node["transform"]["rotation"]["w"].as<double>();
+    qx = node["transform"]["rotation"]["x"].as<double>();
+    qy = node["transform"]["rotation"]["y"].as<double>();
+    qz = node["transform"]["rotation"]["z"].as<double>();
+    tx = node["transform"]["translation"]["x"].as<double>();
+    ty = node["transform"]["translation"]["y"].as<double>();
+    tz = node["transform"]["translation"]["z"].as<double>();
+  } catch (YAML::Exception& e) {
+    AERROR << "load camera extrinsic file " << yaml_file
+           << " with error, YAML exception:" << e.what();
+    return false;
+  }
+  respeaker_extrinsic->setConstant(0);
+  Eigen::Quaterniond q;
+  q.x() = qx;
+  q.y() = qy;
+  q.z() = qz;
+  q.w() = qw;
+  (*respeaker_extrinsic).block<3, 3>(0, 0) = q.normalized().toRotationMatrix();
+  (*respeaker_extrinsic)(0, 3) = tx;
+  (*respeaker_extrinsic)(1, 3) = ty;
+  (*respeaker_extrinsic)(2, 3) = tz;
+  (*respeaker_extrinsic)(3, 3) = 1;
+  return true;
+}
+
+double DirectionDetection::GccPhat(const torch::Tensor& sig,
+                                   const torch::Tensor& refsig, int fs,
+                                   double max_tau, int interp) {
   const int n_sig = sig.size(0), n_refsig = refsig.size(0),
             n = n_sig + n_refsig;
   torch::Tensor psig = at::constant_pad_nd(sig, {0, n_refsig}, 0);
   torch::Tensor prefsig = at::constant_pad_nd(refsig, {0, n_sig}, 0);
   psig = at::rfft(psig, 1, false, true);
   prefsig = at::rfft(prefsig, 1, false, true);
-  torch::Tensor r = psig * at::conj(prefsig);
-  torch::Tensor cc = at::irfft(r / at::abs(r), 1, false, true, {interp * n});
+
+  ConjugateTensor(&prefsig);
+  torch::Tensor r = ComplexMultiply(psig, prefsig);
+  torch::Tensor cc =
+      at::irfft(r / ComplexAbsolute(r), 1, false, true, {interp * n});
   int max_shift = static_cast<int>(interp * n / 2);
   if (max_tau != 0)
     max_shift = std::min(static_cast<int>(interp * fs * max_tau), max_shift);
 
-  auto begin = cc.index({Slice(0, cc.size(0) - max_shift, None)});
-  auto end = cc.index({Slice(0, None, max_shift + 1)});
+  auto begin = cc.index({Slice(cc.size(0) - max_shift, None)});
+  auto end = cc.index({Slice(None, max_shift + 1)});
   cc = at::cat({begin, end});
-  auto ttt = at::argmax(at::abs(cc), 0);
   // find max cross correlation index
   const int shift = at::argmax(at::abs(cc), 0).item<int>() - max_shift;
   const double tau = shift / static_cast<double>(interp * fs);
@@ -81,5 +152,25 @@ double gcc_phat(const torch::Tensor& sig, const torch::Tensor& refsig, int fs,
   return tau;
 }
 
+void DirectionDetection::ConjugateTensor(torch::Tensor* tensor) {
+  tensor->index_put_({"...", 1}, -tensor->index({"...", 1}));
+}
+
+torch::Tensor DirectionDetection::ComplexMultiply(const torch::Tensor& a,
+                                                  const torch::Tensor& b) {
+  torch::Tensor real = a.index({"...", 0}) * b.index({"...", 0}) -
+                       a.index({"...", 1}) * b.index({"...", 1});
+  torch::Tensor imag = a.index({"...", 0}) * b.index({"...", 1}) +
+                       a.index({"...", 1}) * b.index({"...", 0});
+  return at::cat({real.reshape({-1, 1}), imag.reshape({-1, 1})}, 1);
+}
+
+torch::Tensor DirectionDetection::ComplexAbsolute(const torch::Tensor& tensor) {
+  torch::Tensor res = tensor * tensor;
+  res = at::sqrt(res.sum(1)).reshape({-1, 1});
+
+  return res;
+}
+
 }  // namespace audio
 }  // namespace apollo
diff --git a/modules/audio/inference/direction_detection.h b/modules/audio/inference/direction_detection.h
index 33e54402cf..7fc26f4d4f 100644
--- a/modules/audio/inference/direction_detection.h
+++ b/modules/audio/inference/direction_detection.h
@@ -19,21 +19,60 @@
 
 #include <algorithm>
 #include <cmath>
+#include <memory>
+#include <string>
+#include <utility>
 #include <vector>
 
+#include "Eigen/Eigen"
+// Eigen 3.3.7: #define ALIVE (0)
+// fastrtps: enum ChangeKind_t { ALIVE, ... };
+#if defined(ALIVE)
+#undef ALIVE
+#endif
+
 #include "ATen/ATen.h"
-#include "cyber/cyber.h"
 #include "torch/torch.h"
 
+#include "cyber/cyber.h"
+#include "modules/common/proto/geometry.pb.h"
+
 namespace apollo {
 namespace audio {
 
-constexpr double SOUND_SPEED = 343.2;
+using apollo::common::Point3D;
+
+class DirectionDetection {
+ public:
+  DirectionDetection();
+  ~DirectionDetection();
+  // Estimates the position of the source of the sound
+  Point3D EstimateSoundSource(std::vector<std::vector<double>>&& channels_vec,
+                              const std::string& respeaker_extrinsic_file,
+                              const int sample_rate, const double mic_distance);
+
+ private:
+  const double kSoundSpeed = 343.2;
+  const int kDistance = 50;
+  std::unique_ptr<Eigen::Matrix4d> respeaker2imu_ptr_;
+
+  // Estimates the direction of the source of the sound
+  double EstimateDirection(std::vector<std::vector<double>>&& channels_vec,
+                           const int sample_rate, const double mic_distance);
+
+  bool LoadExtrinsics(const std::string& yaml_file,
+                      Eigen::Matrix4d* respeaker_extrinsic);
+
+  // Computes the offset between the signal sig and the reference signal refsig
+  // using the Generalized Cross Correlation - Phase Transform (GCC-PHAT)method.
+  double GccPhat(const torch::Tensor& sig, const torch::Tensor& refsig, int fs,
+                 double max_tau, int interp);
 
-double gcc_phat(const torch::Tensor& sig, const torch::Tensor& refsig, int fs,
-               double max_tau, int interp);
-int get_direction(std::vector<std::vector<double>>&& channels_vec,
-                  const int sample_rate, const int mic_distance);
+  // Libtorch does not support Complex type currently.
+  void ConjugateTensor(torch::Tensor* tensor);
+  torch::Tensor ComplexMultiply(const torch::Tensor& a, const torch::Tensor& b);
+  torch::Tensor ComplexAbsolute(const torch::Tensor& tensor);
+};
 
 }  // namespace audio
 }  // namespace apollo
diff --git a/modules/audio/proto/BUILD b/modules/audio/proto/BUILD
index 3d3709f975..b58bfe1452 100644
--- a/modules/audio/proto/BUILD
+++ b/modules/audio/proto/BUILD
@@ -15,12 +15,16 @@ cc_proto_library(
 proto_library(
     name = "audio_proto",
     srcs = ["audio.proto"],
+    deps = [
+        "//modules/common/proto:geometry_proto",
+    ],
 )
 
 py_proto_library(
     name = "audio_py_pb2",
     deps = [
         ":audio_proto",
+        "//modules/common/proto:geometry_py_pb2",
     ],
 )
 
diff --git a/modules/audio/proto/audio.proto b/modules/audio/proto/audio.proto
index a0594a4ccb..cffe39c1fe 100644
--- a/modules/audio/proto/audio.proto
+++ b/modules/audio/proto/audio.proto
@@ -18,6 +18,8 @@ syntax = "proto2";
 
 package apollo.audio;
 
+import "modules/common/proto/geometry.proto";
+
 enum MovingResult {
   UNKNOWN = 0;
   APPROACHING = 1;
@@ -28,4 +30,5 @@ enum MovingResult {
 message AudioDetection {
   optional bool is_siren = 1;
   optional MovingResult moving_result = 2 [default = UNKNOWN];
+  optional apollo.common.Point3D position = 3;
 }
diff --git a/modules/audio/proto/audio_conf.proto b/modules/audio/proto/audio_conf.proto
index 409189c6a0..7193044e32 100644
--- a/modules/audio/proto/audio_conf.proto
+++ b/modules/audio/proto/audio_conf.proto
@@ -26,4 +26,5 @@ message TopicConf {
 
 message AudioConf {
   optional TopicConf topic_conf = 1;
+  optional string respeaker_extrinsics_path = 2;
 }
diff --git a/modules/dreamview/conf/vehicle_data.pb.txt b/modules/dreamview/conf/vehicle_data.pb.txt
index 069c0bafe5..6856e0a71a 100644
--- a/modules/dreamview/conf/vehicle_data.pb.txt
+++ b/modules/dreamview/conf/vehicle_data.pb.txt
@@ -42,6 +42,10 @@ data_files {
   source_path: "radar_params"
   dest_path: "/apollo/modules/perception/data/params"
 }
+data_files {
+  source_path: "microphone_params"
+  dest_path: "/apollo/modules/audio/conf/respeaker_extrinsics.yaml"
+}
 data_files {
   source_path: "gnss_params/ant_imu_leverarm.yaml"
   dest_path: "/apollo/modules/localization/msf/params/gnss_params/ant_imu_leverarm.yaml"
-- 
GitLab