Create Audio Feature in SDK (#344)

* Create Audio Feature in SDK * Add APIs to record audio in SDK * Add corresponding APIs in pybind, storage.py, sdk.h * Implement reservoir sampling when collecting audio samples * fix clang format and update based on comment * use int8_t for reading records and convert string directly from vector * refactor IsSampleTaken to IndexOfSampleTaken * fix clang format again

Create Audio Feature in SDK (#344)
* Create Audio Feature in SDK * Add APIs to record audio in SDK * Add corresponding APIs in pybind, storage.py, sdk.h * Implement reservoir sampling when collecting audio samples * fix clang format and update based on comment * use int8_t for reading records and convert string directly from vector * refactor IsSampleTaken to IndexOfSampleTaken * fix clang format again
37a3559a · Nicky Chan · GitHub · dbc68507 · 37a3559a · 37a3559a
7 changed files
--- a/visualdl/logic/pybind.cc
+++ b/visualdl/logic/pybind.cc
@@ -79,9 +79,14 @@ PYBIND11_MODULE(core, m) {
             auto tablet = self.tablet(tag);
             return vs::components::ImageReader(self.mode(), tablet);
           })
-      .def("get_text", [](vs::LogReader& self, const std::string& tag) {
+      .def("get_text",
+           [](vs::LogReader& self, const std::string& tag) {
+             auto tablet = self.tablet(tag);
+             return vs::components::TextReader(tablet);
+           })
+      .def("get_audio", [](vs::LogReader& self, const std::string& tag) {
        auto tablet = self.tablet(tag);
-        return vs::components::TextReader(tablet);
+        return vs::components::AudioReader(self.mode(), tablet);
      });
  // clang-format on
@@ -119,10 +124,19 @@ PYBIND11_MODULE(core, m) {
             auto tablet = self.AddTablet(tag);
             return vs::components::Image(tablet, num_samples, step_cycle);
           })
-      .def("new_text", [](vs::LogWriter& self, const std::string& tag) {
+      .def("new_text",
-        auto tablet = self.AddTablet(tag);
+           [](vs::LogWriter& self, const std::string& tag) {
-        return vs::components::Text(tablet);
+             auto tablet = self.AddTablet(tag);
-      });
+             return vs::components::Text(tablet);
+           })
+      .def("new_audio",
+           [](vs::LogWriter& self,
+              const std::string& tag,
+              int num_samples,
+              int step_cycle) {
+             auto tablet = self.AddTablet(tag);
+             return vs::components::Audio(tablet, num_samples, step_cycle);
+           });
 //------------------- components --------------------
 #define ADD_SCALAR_READER(T)                               \
@@ -161,7 +175,7 @@ PYBIND11_MODULE(core, m) {
      .def("start_sampling", &cp::Image::StartSampling, R"pbdoc(
        Start a sampling period, this interface will start a new reservoir sampling phase.
      )pbdoc")
-      .def("is_sample_taken", &cp::Image::IsSampleTaken, R"pbdoc(
+      .def("is_sample_taken", &cp::Image::IndexOfSampleTaken, R"pbdoc(
        Will this sample be taken, this interface is introduced to reduce the cost
        of copy image data, by testing whether this image will be sampled, and only
        copy data when it should be sampled. In that way, most of un-sampled image
@@ -219,6 +233,61 @@ PYBIND11_MODULE(core, m) {
      .def("total_records", &cp::TextReader::total_records)
      .def("size", &cp::TextReader::size);
+  py::class_<cp::Audio>(m, "AudioWriter", R"pbdoc(
+            PyBind class. Must instantiate through the LogWriter.
+          )pbdoc")
+      .def("set_caption", &cp::Audio::SetCaption, R"pbdoc(
+            PyBind class. Must instantiate through the LogWriter.
+          )pbdoc")
+      .def("start_sampling", &cp::Audio::StartSampling, R"pbdoc(
+            Start a sampling period, this interface will start a new reservoir sampling phase.
+          )pbdoc")
+      .def("is_sample_taken", &cp::Audio::IndexOfSampleTaken, R"pbdoc(
+            Will this sample be taken, this interface is introduced to reduce the cost
+            of copy audio data, by testing whether this audio will be sampled, and only
+            copy data when it should be sampled. In that way, most of un-sampled audio
+            data need not be copied or processed at all.
+            :return: Index
+            :rtype: integer
+                  )pbdoc")
+      .def("finish_sampling", &cp::Audio::FinishSampling, R"pbdoc(
+            End a sampling period, it will clear all states for reservoir sampling.
+          )pbdoc")
+      .def("set_sample", &cp::Audio::SetSample, R"pbdoc(
+            Store the flatten audio data with sample rate specified.
+            :param index:
+            :type index: integer
+            :param sample_rate: Sample rate of audio
+            :type sample_rate: integer
+            :param audio_data: Flatten audio data
+            :type audio_data: list
+                  )pbdoc")
+      .def("add_sample", &cp::Audio::AddSample, R"pbdoc(
+            A combined interface for is_sample_taken and set_sample, simpler but is less efficient.
+            :param sample_rate: Sample rate of audio
+            :type sample_rate: integer
+            :param audio_data: Flatten audio data
+            :type audio_data: list
+                  )pbdoc");
+  py::class_<cp::AudioReader::AudioRecord>(m, "AudioRecord")
+      // TODO(Nicky) make these copyless.
+      .def("data", [](cp::AudioReader::AudioRecord& self) { return self.data; })
+      .def("sample_rate",
+           [](cp::AudioReader::AudioRecord& self) { return self.sample_rate; })
+      .def("step_id",
+           [](cp::AudioReader::AudioRecord& self) { return self.step_id; });
+  py::class_<cp::AudioReader>(m, "AudioReader")
+      .def("caption", &cp::AudioReader::caption)
+      .def("num_records", &cp::AudioReader::num_records)
+      .def("num_samples", &cp::AudioReader::num_samples)
+      .def("record", &cp::AudioReader::record)
+      .def("timestamp", &cp::AudioReader::timestamp);
 #define ADD_HISTOGRAM_WRITER(T)                                          \
  py::class_<cp::Histogram<T>>(m, "HistogramWriter__" #T, \ 
   R"pbdoc(PyBind class. Must instantiate through the LogWriter.)pbdoc") \

--- a/visualdl/logic/sdk.cc
+++ b/visualdl/logic/sdk.cc
@@ -160,7 +160,7 @@ void Image::StartSampling() {
  num_records_ = 0;
 }
-int Image::IsSampleTaken() {
+int Image::IndexOfSampleTaken() {
  if (!ToSampleThisStep()) return -1;
  num_records_++;
  if (num_records_ <= num_samples_) {
@@ -195,7 +195,7 @@ struct is_same_type<T, T> {
 void Image::AddSample(const std::vector<shape_t>& shape,
                      const std::vector<value_t>& data) {
-  auto idx = IsSampleTaken();
+  auto idx = IndexOfSampleTaken();
  if (idx >= 0) {
    SetSample(idx, shape, data);
  }
@@ -222,11 +222,6 @@ void Image::SetSample(int index,
  CHECK_LT(index, num_samples_);
  CHECK_LE(index, num_records_);
-  // trick to store int8 to protobuf
-  std::vector<byte_t> data_str(data.size());
-  for (int i = 0; i < data.size(); i++) {
-    data_str[i] = data[i];
-  }
  Uint8Image image(new_shape[2], new_shape[0] * new_shape[1]);
  NormalizeImage(&image, &data[0], new_shape[0] * new_shape[1], new_shape[2]);
@@ -352,6 +347,105 @@ std::string TextReader::caption() const {
 size_t TextReader::size() const { return reader_.total_records(); }
+/*
+ * Open a new record for the current step and add one data entry per sample
+ * slot. Does nothing when the current step is not on the sampling cycle.
+ */
+void Audio::StartSampling() {
+  if (!ToSampleThisStep()) {
+    return;
+  }
+  step_ = writer_.AddRecord();
+  step_.SetId(step_id_);
+  time_t now = std::time(nullptr);
+  step_.SetTimeStamp(now);
+  // Add num_samples_ entries up front so SetSample() can address them by
+  // index.
+  int slot = 0;
+  while (slot < num_samples_) {
+    step_.AddData();
+    ++slot;
+  }
+  // Restart the per-step counter consumed by IndexOfSampleTaken().
+  num_records_ = 0;
+}
+/*
+ * Decide whether the next candidate of the current step is kept.
+ * Returns the slot index the sample should be written to, or -1 when the
+ * sample is skipped (either the step is off-cycle or the random draw
+ * rejected it).
+ */
+int Audio::IndexOfSampleTaken() {
+  if (!ToSampleThisStep()) {
+    return -1;
+  }
+  ++num_records_;
+  // The first num_samples_ candidates fill the slots in arrival order.
+  if (num_records_ <= num_samples_) {
+    return num_records_ - 1;
+  }
+  // Later candidates are accepted with probability
+  // num_samples_ / num_records_.
+  const float keep_prob = static_cast<float>(num_samples_) / num_records_;
+  const float draw = static_cast<float>(rand()) / RAND_MAX;
+  if (draw >= keep_prob) {
+    return -1;
+  }
+  // Accepted: overwrite a randomly chosen slot.
+  return rand() % num_samples_;
+}
+/*
+ * Close the current step: advance the step counter and, when the cycle test
+ * passes, ask the parent storage to persist to disk.
+ */
+void Audio::FinishSampling() {
+  ++step_id_;
+  // NOTE(review): the cycle test below runs against the already-incremented
+  // step id -- confirm this is the intended step to persist on.
+  if (!ToSampleThisStep()) {
+    return;
+  }
+  writer_.parent()->PersistToDisk();
+}
+/*
+ * Convenience wrapper: ask IndexOfSampleTaken() for a slot and, when one is
+ * granted (index >= 0), store the audio there through SetSample().
+ */
+void Audio::AddSample(int sample_rate, const std::vector<value_t>& data) {
+  const auto slot = IndexOfSampleTaken();
+  if (slot < 0) {
+    return;
+  }
+  SetSample(slot, sample_rate, data);
+}
+/*
+ * Store one audio sample in slot `index` of the current step.
+ *
+ * index: slot to write; must be < num_samples_ and <= num_records_.
+ * sample_rate: must be positive.
+ * data: audio values, written out as a binary record file.
+ *
+ * NOTE(review): sample_rate is only validated here; nothing in this function
+ * stores it -- confirm whether it should be persisted alongside the data.
+ */
+void Audio::SetSample(int index,
+                      int sample_rate,
+                      const std::vector<value_t>& data) {
+  CHECK_GT(sample_rate, 0)
+      << "sample rate should be something like 6000, 8000 or 44100";
+  CHECK_LT(index, num_samples_)
+      << "index should be less than number of samples";
+  CHECK_LE(index, num_records_)
+      << "index should be less than or equal to number of records";
+  // NOTE(review): building a std::string from the iterator range converts
+  // each value_t element to a single char, so fractional parts are dropped --
+  // confirm this is the intended encoding.
+  BinaryRecord brcd(GenBinaryRecordDir(step_.parent()->dir()),
+                    std::string(data.begin(), data.end()));
+  brcd.tofile();
+  auto entry = step_.MutableData<std::vector<byte_t>>(index);
+  // If the slot already references a binary record file, delete that file
+  // before pointing the slot at the new one.
+  // NOTE(review): the new file is written before the old one is removed; if
+  // both can share a name the fresh file would be deleted -- verify against
+  // BinaryRecord::filename().
+  auto old_hash = entry.reader().GetRaw();
+  if (!old_hash.empty()) {
+    std::string old_path =
+        GenBinaryRecordDir(step_.parent()->dir()) + "/" + old_hash;
+    CHECK_EQ(std::remove(old_path.c_str()), 0) << "delete old binary record "
+                                               << old_path << " failed";
+  }
+  // The entry stores the record's file name, not the audio bytes themselves.
+  entry.SetRaw(brcd.filename());
+}
+/*
+ * Return the caption of this audio tablet. Exactly one caption is expected.
+ * When the stored caption matches the reader's mode it is turned into a
+ * readable tag; otherwise it is tag-decoded and returned.
+ */
+std::string AudioReader::caption() {
+  CHECK_EQ(reader_.captions().size(), 1);
+  auto raw_tag = reader_.captions().front();
+  if (!LogReader::TagMatchMode(raw_tag, mode_)) {
+    string::TagDecode(raw_tag);
+    return raw_tag;
+  }
+  return LogReader::GenReadableTag(mode_, raw_tag);
+}
+/*
+ * Load one stored audio sample.
+ *
+ * offset: offset of a step.
+ * index: index of a sample inside that step.
+ *
+ * Returns an AudioRecord holding the step id and the bytes read from the
+ * binary record file referenced by the entry. sample_rate is left at 0
+ * because this function has no source to read it from.
+ */
+AudioReader::AudioRecord AudioReader::record(int offset, int index) {
+  // Value-initialize so that no field (notably sample_rate, which is never
+  // assigned below) is returned with an indeterminate value.
+  AudioRecord res{};
+  auto record = reader_.record(offset);
+  auto entry = record.data(index);
+  // The entry holds the file name of the binary record, not the audio bytes.
+  auto filename = entry.GetRaw();
+  CHECK(!g_log_dir.empty())
+      << "g_log_dir should be set in LogReader construction";
+  BinaryRecordReader brcd(GenBinaryRecordDir(g_log_dir), filename);
+  std::transform(brcd.data.begin(),
+                 brcd.data.end(),
+                 std::back_inserter(res.data),
+                 [](byte_t i) { return static_cast<int8_t>(i); });
+  res.step_id = record.id();
+  return res;
+}
 }  // namespace components
 }  // namespace visualdl
--- a/visualdl/logic/sdk.h
+++ b/visualdl/logic/sdk.h
@@ -170,8 +170,9 @@ struct Image {
  void FinishSampling();
  /*
-   * A combined interface for IsSampleTaken and SetSample, simpler but might be
+   * A combined interface for IndexOfSampleTaken and SetSample, simpler but
-   * low effience.
+   * might be
+   * low efficiency.
   */
  void AddSample(const std::vector<shape_t>& shape,
                 const std::vector<value_t>& data);
@@ -182,7 +183,7 @@ struct Image {
   * copy data when it should be sampled. In that way, most of unsampled image
   * data need not be copied or processed at all.
   */
-  int IsSampleTaken();
+  int IndexOfSampleTaken();
  /*
   * Just store a tensor with nothing to do with image format.
   */
@@ -326,6 +327,115 @@ private:
  TabletReader reader_;
 };
+/*
+ * Audio component writer.
+ *
+ * Typical call order per step: StartSampling(), then AddSample() (or
+ * IndexOfSampleTaken() followed by SetSample()) for each candidate, then
+ * FinishSampling().
+ */
+struct Audio {
+  using value_t = float;
+  /*
+   * step_cycle: store every `step_cycle` as a record; must be > 0.
+   * num_samples: how many samples to take in a step; must be > 0.
+   */
+  Audio(Tablet tablet, int num_samples, int step_cycle)
+      : writer_(tablet), num_samples_(num_samples), step_cycle_(step_cycle) {
+    CHECK_GT(step_cycle, 0);
+    CHECK_GT(num_samples, 0);
+    writer_.SetType(Tablet::Type::kAudio);
+    writer_.SetNumSamples(num_samples);
+    // use the audio's tag as the default caption.
+    SetCaption(tablet.reader().tag());
+  }
+  /*
+   * Replace the tablet's captions with the single caption `c`.
+   */
+  void SetCaption(const std::string& c) {
+    writer_.SetCaptions(std::vector<std::string>({c}));
+  }
+  /*
+   * Start a sampling period, this interface will start a new reservoir sampling
+   * phase.
+   */
+  void StartSampling();
+  /*
+   * End a sampling period, it will clear all states for reservoir sampling.
+   */
+  void FinishSampling();
+  /*
+   * A combined interface for IndexOfSampleTaken and SetSample, simpler but
+   * possibly less efficient.
+   */
+  void AddSample(int sample_rate, const std::vector<value_t>& data);
+  /*
+   * Will this sample be taken, this interface is introduced to reduce the cost
+   * of copy audio data, by testing whether this audio will be sampled, and only
+   * copy data when it should be sampled. In that way, most of unsampled audio
+   * data need not be copied or processed at all.
+   *
+   * Returns the index of the slot to write, or -1 if the sample is not taken.
+   */
+  int IndexOfSampleTaken();
+  /*
+   * Store audio data with sample rate in the slot `index`.
+   */
+  void SetSample(int index, int sample_rate, const std::vector<value_t>& data);
+protected:
+  // True when the current step id is a multiple of step_cycle_.
+  bool ToSampleThisStep() { return step_id_ % step_cycle_ == 0; }
+private:
+  Tablet writer_;
+  // Record of the step currently being sampled.
+  Record step_;
+  // Number of candidates seen so far in the current step.
+  int num_records_{0};
+  // Number of sample slots kept per step.
+  int num_samples_{0};
+  // Counter of finished steps; incremented by FinishSampling().
+  int step_id_{0};
+  // Only steps whose id is a multiple of this value are recorded.
+  int step_cycle_;
+};
+/*
+ * Audio reader.
+ *
+ * Reads back the captions, step records and sample data of an audio tablet.
+ */
+struct AudioReader {
+  using value_t = typename Audio::value_t;
+  /*
+   * One sample read back from storage. The integer fields carry default
+   * member initializers so a default-constructed record never exposes
+   * indeterminate values.
+   */
+  struct AudioRecord {
+    int step_id{0};
+    int sample_rate{0};
+    std::vector<int8_t> data;
+  };
+  /*
+   * mode: the mode this reader works in; used when resolving the caption.
+   * tablet: the tablet reader to read from.
+   */
+  AudioReader(const std::string& mode, TabletReader tablet)
+      : reader_(tablet), mode_{mode} {}
+  std::string caption();
+  // number of steps.
+  int num_records() { return reader_.total_records(); }
+  // number of samples, as reported by the tablet reader.
+  int num_samples() { return reader_.num_samples(); }
+  // timestamp of the record at offset `step`.
+  int64_t timestamp(int step) { return reader_.record(step).timestamp(); }
+  /*
+   * offset: offset of a step.
+   * index: index of a sample.
+   */
+  AudioRecord record(int offset, int index);
+  /*
+   * offset: offset of a step.
+   * index: index of a sample.
+   *
+   * NOTE(review): no definition of data() or stepid() is visible alongside
+   * the other AudioReader members -- confirm they are implemented before
+   * calling them.
+   */
+  std::vector<value_t> data(int offset, int index);
+  int stepid(int offset, int index);
+private:
+  TabletReader reader_;
+  std::string mode_;
+};
 }  // namespace components
 }  // namespace visualdl

--- a/visualdl/logic/sdk_test.cc
+++ b/visualdl/logic/sdk_test.cc
@@ -132,6 +132,40 @@ TEST(Image, add_sample_test) {
  CHECK_EQ(image2read.num_records(), num_steps);
 }
+// Writes sampled audio for several steps and reads the tablet back.
+// Named Audio.add_sample_test: reusing Image.add_sample_test would redefine
+// the test declared just above and would leave the Audio component untested.
+TEST(Audio, add_sample_test) {
+  const auto dir = "./tmp/sdk_test.audio";
+  LogWriter writer__(dir, 4);
+  auto writer = writer__.AsMode("train");
+  auto tablet = writer.AddTablet("audio0");
+  components::Audio audio(tablet, 3, 1);
+  const int num_steps = 10;
+  const int sample_rate = 8000;
+  LOG(INFO) << "write audio";
+  audio.SetCaption("this is an audio");
+  for (int step = 0; step < num_steps; step++) {
+    audio.StartSampling();
+    // Offer more candidates (7) than slots (3) so that the reservoir
+    // replacement path is exercised as well.
+    for (int i = 0; i < 7; i++) {
+      vector<float> data;
+      for (int j = 0; j < 3 * 5 * 5; j++) {
+        // Whole numbers in [0, 127] stay representable when the writer
+        // converts each value to a single char.
+        data.push_back(static_cast<float>(rand() % 128));
+      }
+      audio.AddSample(sample_rate, data);
+    }
+    audio.FinishSampling();
+  }
+  LOG(INFO) << "read audio";
+  // read it
+  LogReader reader__(dir);
+  auto reader = reader__.AsMode("train");
+  auto tablet2read = reader.tablet("audio0");
+  components::AudioReader audio2read("train", tablet2read);
+  CHECK_EQ(audio2read.caption(), "this is an audio");
+  CHECK_EQ(audio2read.num_records(), num_steps);
+}
 TEST(Histogram, AddRecord) {
  const auto dir = "./tmp/sdk_test.histogram";
  LogWriter writer__(dir, 1);

--- a/visualdl/python/storage.py
+++ b/visualdl/python/storage.py
@@ -119,6 +119,16 @@ class LogReader(object):
        check_tag_name_valid(tag)
        return self.reader.get_text(tag)
+    def audio(self, tag):
+        """
+        Return the audio reader registered under ``tag``.
+        :param tag: Tag of the audio data the returned reader will read
+        :type tag: basestring
+        :return: The object returned by the underlying reader's ``get_audio``
+        """
+        check_tag_name_valid(tag)
+        audio_reader = self.reader.get_audio(tag)
+        return audio_reader
    def __enter__(self):
        return self
@@ -226,6 +236,22 @@ class LogWriter(object):
        }
        return types[type](tag, num_buckets)
+    def audio(self, tag, num_samples, step_cycle=1):
+        """
+        Build an audio writer used to record sampled audio data.
+        :param tag: Label attached to the audio written by the returned writer
+        :type tag: basestring
+        :param num_samples: Number of samples to take in a step
+        :type num_samples: integer
+        :param step_cycle: Store every `step_cycle` as a record
+        :type step_cycle: integer
+        :return: An audio writer to sample audio
+        :rtype: AudioWriter
+        """
+        check_tag_name_valid(tag)
+        audio_writer = self.writer.new_audio(tag, num_samples, step_cycle)
+        return audio_writer
    def text(self, tag):
        check_tag_name_valid(tag)
        return self.writer.new_text(tag)

--- a/visualdl/storage/storage.proto
+++ b/visualdl/storage/storage.proto
@@ -108,6 +108,7 @@ message Tablet {
    kHistogram = 1;
    kImage = 2;
    kText = 3;
+    kAudio = 4;
  }
  // The unique identification for this `Tablet`. VisualDL will have no the
  // concept of FileWriter like TB. It will store all the tablets in a single

--- a/visualdl/storage/tablet.h
+++ b/visualdl/storage/tablet.h
@@ -34,6 +34,7 @@ struct Tablet {
    kHistogram = 1,
    kImage = 2,
    kText = 3,
+    kAudio = 4,
    kUnknown = -1
  };
@@ -55,6 +56,9 @@ struct Tablet {
    if (name == "text") {
      return kText;
    }
+    if (name == "audio") {
+      return kAudio;
+    }
    LOG(ERROR) << "unknown component: " << name;
    return kUnknown;
  }