提交 e3d73acd 编写于 作者: H Hui Zhang

fix io; add test

上级 4b5410ee
# Locale configuration: force US-English UTF-8 for every program in this shell.
export LC_ALL='en_US.UTF-8'
export LANG='en_US.UTF-8'
export LANGUAGE='en_US.UTF-8'

# Convenience aliases.
alias nvs='nvidia-smi'               # quick GPU status check
alias rsync='rsync --progress -raz'  # archive-mode rsync with progress output
alias his='history'                  # shorter history listing
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "extensive-venice",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x\n"
]
},
{
"data": {
"text/plain": [
"'/workspace/DeepSpeech-2.x'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%cd ..\n",
"%pwd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "correct-window",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"manifest.dev\t manifest.test-clean\t manifest.train\r\n",
"manifest.dev.raw manifest.test-clean.raw manifest.train.raw\r\n"
]
}
],
"source": [
"!ls /workspace/DeepSpeech-2.x/examples/librispeech/s2/data/"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "exceptional-cheese",
"metadata": {},
"outputs": [],
"source": [
"dev_data='/workspace/DeepSpeech-2.x/examples/librispeech/s2/data/manifest.dev'"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "extraordinary-orleans",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"register user softmax to paddle, remove this when fixed!\n",
"register user log_softmax to paddle, remove this when fixed!\n",
"register user sigmoid to paddle, remove this when fixed!\n",
"register user log_sigmoid to paddle, remove this when fixed!\n",
"register user relu to paddle, remove this when fixed!\n",
"override cat of paddle if exists or register, remove this when fixed!\n",
"override long of paddle.Tensor if exists or register, remove this when fixed!\n",
"override new_full of paddle.Tensor if exists or register, remove this when fixed!\n",
"override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
"override eq of paddle if exists or register, remove this when fixed!\n",
"override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
"override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
"register user view to paddle.Tensor, remove this when fixed!\n",
"register user view_as to paddle.Tensor, remove this when fixed!\n",
"register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"register user fill_ to paddle.Tensor, remove this when fixed!\n",
"register user repeat to paddle.Tensor, remove this when fixed!\n",
"register user softmax to paddle.Tensor, remove this when fixed!\n",
"register user sigmoid to paddle.Tensor, remove this when fixed!\n",
"register user relu to paddle.Tensor, remove this when fixed!\n",
"register user type_as to paddle.Tensor, remove this when fixed!\n",
"register user to to paddle.Tensor, remove this when fixed!\n",
"register user float to paddle.Tensor, remove this when fixed!\n",
"register user int to paddle.Tensor, remove this when fixed!\n",
"register user GLU to paddle.nn, remove this when fixed!\n",
"register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"register user export to paddle.jit, remove this when fixed!\n"
]
}
],
"source": [
"from deepspeech.frontend.utility import read_manifest"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "returning-lighter",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"dev_json = read_manifest(dev_data)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "western-founder",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'input': [{'feat': '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.1.ark:16',\n",
" 'name': 'input1',\n",
" 'shape': [1063, 83]}],\n",
" 'output': [{'name': 'target1',\n",
" 'shape': [41, 5002],\n",
" 'text': 'AS I APPROACHED THE CITY I HEARD BELLS RINGING AND A '\n",
" 'LITTLE LATER I FOUND THE STREETS ASTIR WITH THRONGS OF '\n",
" 'WELL DRESSED PEOPLE IN FAMILY GROUPS WENDING THEIR WAY '\n",
" 'HITHER AND THITHER',\n",
" 'token': '▁AS ▁I ▁APPROACHED ▁THE ▁CITY ▁I ▁HEARD ▁BELL S ▁RING '\n",
" 'ING ▁AND ▁A ▁LITTLE ▁LATER ▁I ▁FOUND ▁THE ▁STREETS ▁AS '\n",
" 'T IR ▁WITH ▁THRONG S ▁OF ▁WELL ▁DRESSED ▁PEOPLE ▁IN '\n",
" '▁FAMILY ▁GROUP S ▁WE ND ING ▁THEIR ▁WAY ▁HITHER ▁AND '\n",
" '▁THITHER',\n",
" 'tokenid': '713 2458 676 4502 1155 2458 2351 849 389 3831 206 627 '\n",
" '482 2812 2728 2458 2104 4502 4316 713 404 212 4925 '\n",
" '4549 389 3204 4861 1677 3339 2495 1950 2279 389 4845 '\n",
" '302 206 4504 4843 2394 627 4526'}],\n",
" 'utt': '116-288045-0000',\n",
" 'utt2spk': '116-288045'}\n",
"5542\n",
"<class 'list'>\n"
]
}
],
"source": [
"from pprint import pprint\n",
"pprint(dev_json[0])\n",
"print(len(dev_json))\n",
"print(type(dev_json))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "motivated-receptor",
"metadata": {},
"outputs": [],
"source": [
"# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# http://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"import itertools\n",
"\n",
"import numpy as np\n",
"\n",
"from deepspeech.utils.log import Log\n",
"\n",
"__all__ = [\"make_batchset\"]\n",
"\n",
"logger = Log(__name__).getlog()\n",
"\n",
"\n",
"def batchfy_by_seq(\n",
" sorted_data,\n",
" batch_size,\n",
" max_length_in,\n",
" max_length_out,\n",
" min_batch_size=1,\n",
" shortest_first=False,\n",
" ikey=\"input\",\n",
" iaxis=0,\n",
" okey=\"output\",\n",
" oaxis=0, ):\n",
" \"\"\"Make batch set from json dictionary\n",
"\n",
" :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json\n",
" :param int batch_size: batch size\n",
" :param int max_length_in: maximum length of input to decide adaptive batch size\n",
" :param int max_length_out: maximum length of output to decide adaptive batch size\n",
" :param int min_batch_size: mininum batch size (for multi-gpu)\n",
" :param bool shortest_first: Sort from batch with shortest samples\n",
" to longest if true, otherwise reverse\n",
" :param str ikey: key to access input\n",
" (for ASR ikey=\"input\", for TTS, MT ikey=\"output\".)\n",
" :param int iaxis: dimension to access input\n",
" (for ASR, TTS iaxis=0, for MT iaxis=\"1\".)\n",
" :param str okey: key to access output\n",
" (for ASR, MT okey=\"output\". for TTS okey=\"input\".)\n",
" :param int oaxis: dimension to access output\n",
" (for ASR, TTS, MT oaxis=0, reserved for future research, -1 means all axis.)\n",
" :return: List[List[Tuple[str, dict]]] list of batches\n",
" \"\"\"\n",
" if batch_size <= 0:\n",
" raise ValueError(f\"Invalid batch_size={batch_size}\")\n",
"\n",
" # check #utts is more than min_batch_size\n",
" if len(sorted_data) < min_batch_size:\n",
" raise ValueError(\n",
" f\"#utts({len(sorted_data)}) is less than min_batch_size({min_batch_size}).\"\n",
" )\n",
"\n",
" # make list of minibatches\n",
" minibatches = []\n",
" start = 0\n",
" while True:\n",
" _, info = sorted_data[start]\n",
" ilen = int(info[ikey][iaxis][\"shape\"][0])\n",
" olen = (int(info[okey][oaxis][\"shape\"][0]) if oaxis >= 0 else\n",
" max(map(lambda x: int(x[\"shape\"][0]), info[okey])))\n",
" factor = max(int(ilen / max_length_in), int(olen / max_length_out))\n",
" # change batchsize depending on the input and output length\n",
" # if ilen = 1000 and max_length_in = 800\n",
" # then b = batchsize / 2\n",
" # and max(min_batches, .) avoids batchsize = 0\n",
" bs = max(min_batch_size, int(batch_size / (1 + factor)))\n",
" end = min(len(sorted_data), start + bs)\n",
" minibatch = sorted_data[start:end]\n",
" if shortest_first:\n",
" minibatch.reverse()\n",
"\n",
" # check each batch is more than minimum batchsize\n",
" if len(minibatch) < min_batch_size:\n",
" mod = min_batch_size - len(minibatch) % min_batch_size\n",
" additional_minibatch = [\n",
" sorted_data[i] for i in np.random.randint(0, start, mod)\n",
" ]\n",
" if shortest_first:\n",
" additional_minibatch.reverse()\n",
" minibatch.extend(additional_minibatch)\n",
" minibatches.append(minibatch)\n",
"\n",
" if end == len(sorted_data):\n",
" break\n",
" start = end\n",
"\n",
" # batch: List[List[Tuple[str, dict]]]\n",
" return minibatches\n",
"\n",
"\n",
"def batchfy_by_bin(\n",
" sorted_data,\n",
" batch_bins,\n",
" num_batches=0,\n",
" min_batch_size=1,\n",
" shortest_first=False,\n",
" ikey=\"input\",\n",
" okey=\"output\", ):\n",
" \"\"\"Make variably sized batch set, which maximizes\n",
"\n",
" the number of bins up to `batch_bins`.\n",
"\n",
" :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json\n",
" :param int batch_bins: Maximum frames of a batch\n",
" :param int num_batches: # number of batches to use (for debug)\n",
" :param int min_batch_size: minimum batch size (for multi-gpu)\n",
" :param int test: Return only every `test` batches\n",
" :param bool shortest_first: Sort from batch with shortest samples\n",
" to longest if true, otherwise reverse\n",
"\n",
" :param str ikey: key to access input (for ASR ikey=\"input\", for TTS ikey=\"output\".)\n",
" :param str okey: key to access output (for ASR okey=\"output\". for TTS okey=\"input\".)\n",
"\n",
" :return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches\n",
" \"\"\"\n",
" if batch_bins <= 0:\n",
" raise ValueError(f\"invalid batch_bins={batch_bins}\")\n",
" length = len(sorted_data)\n",
" idim = int(sorted_data[0][1][ikey][0][\"shape\"][1])\n",
" odim = int(sorted_data[0][1][okey][0][\"shape\"][1])\n",
" logger.info(\"# utts: \" + str(len(sorted_data)))\n",
" minibatches = []\n",
" start = 0\n",
" n = 0\n",
" while True:\n",
" # Dynamic batch size depending on size of samples\n",
" b = 0\n",
" next_size = 0\n",
" max_olen = 0\n",
" while next_size < batch_bins and (start + b) < length:\n",
" ilen = int(sorted_data[start + b][1][ikey][0][\"shape\"][0]) * idim\n",
" olen = int(sorted_data[start + b][1][okey][0][\"shape\"][0]) * odim\n",
" if olen > max_olen:\n",
" max_olen = olen\n",
" next_size = (max_olen + ilen) * (b + 1)\n",
" if next_size <= batch_bins:\n",
" b += 1\n",
" elif next_size == 0:\n",
" raise ValueError(\n",
" f\"Can't fit one sample in batch_bins ({batch_bins}): \"\n",
" f\"Please increase the value\")\n",
" end = min(length, start + max(min_batch_size, b))\n",
" batch = sorted_data[start:end]\n",
" if shortest_first:\n",
" batch.reverse()\n",
" minibatches.append(batch)\n",
" # Check for min_batch_size and fixes the batches if needed\n",
" i = -1\n",
" while len(minibatches[i]) < min_batch_size:\n",
" missing = min_batch_size - len(minibatches[i])\n",
" if -i == len(minibatches):\n",
" minibatches[i + 1].extend(minibatches[i])\n",
" minibatches = minibatches[1:]\n",
" break\n",
" else:\n",
" minibatches[i].extend(minibatches[i - 1][:missing])\n",
" minibatches[i - 1] = minibatches[i - 1][missing:]\n",
" i -= 1\n",
" if end == length:\n",
" break\n",
" start = end\n",
" n += 1\n",
" if num_batches > 0:\n",
" minibatches = minibatches[:num_batches]\n",
" lengths = [len(x) for x in minibatches]\n",
" logger.info(\n",
" str(len(minibatches)) + \" batches containing from \" + str(min(lengths))\n",
" + \" to \" + str(max(lengths)) + \" samples \" + \"(avg \" + str(\n",
" int(np.mean(lengths))) + \" samples).\")\n",
" return minibatches\n",
"\n",
"\n",
"def batchfy_by_frame(\n",
" sorted_data,\n",
" max_frames_in,\n",
" max_frames_out,\n",
" max_frames_inout,\n",
" num_batches=0,\n",
" min_batch_size=1,\n",
" shortest_first=False,\n",
" ikey=\"input\",\n",
" okey=\"output\", ):\n",
" \"\"\"Make variable batch set, which maximizes the number of frames to max_batch_frame.\n",
"\n",
" :param List[(str, Dict[str, Any])] sorteddata: dictionary loaded from data.json\n",
" :param int max_frames_in: Maximum input frames of a batch\n",
" :param int max_frames_out: Maximum output frames of a batch\n",
" :param int max_frames_inout: Maximum input+output frames of a batch\n",
" :param int num_batches: # number of batches to use (for debug)\n",
" :param int min_batch_size: minimum batch size (for multi-gpu)\n",
" :param int test: Return only every `test` batches\n",
" :param bool shortest_first: Sort from batch with shortest samples\n",
" to longest if true, otherwise reverse\n",
"\n",
" :param str ikey: key to access input (for ASR ikey=\"input\", for TTS ikey=\"output\".)\n",
" :param str okey: key to access output (for ASR okey=\"output\". for TTS okey=\"input\".)\n",
"\n",
" :return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches\n",
" \"\"\"\n",
" if max_frames_in <= 0 and max_frames_out <= 0 and max_frames_inout <= 0:\n",
" raise ValueError(\n",
" \"At least, one of `--batch-frames-in`, `--batch-frames-out` or \"\n",
" \"`--batch-frames-inout` should be > 0\")\n",
" length = len(sorted_data)\n",
" minibatches = []\n",
" start = 0\n",
" end = 0\n",
" while end != length:\n",
" # Dynamic batch size depending on size of samples\n",
" b = 0\n",
" max_olen = 0\n",
" max_ilen = 0\n",
" while (start + b) < length:\n",
" ilen = int(sorted_data[start + b][1][ikey][0][\"shape\"][0])\n",
" if ilen > max_frames_in and max_frames_in != 0:\n",
" raise ValueError(\n",
" f\"Can't fit one sample in --batch-frames-in ({max_frames_in}): \"\n",
" f\"Please increase the value\")\n",
" olen = int(sorted_data[start + b][1][okey][0][\"shape\"][0])\n",
" if olen > max_frames_out and max_frames_out != 0:\n",
" raise ValueError(\n",
" f\"Can't fit one sample in --batch-frames-out ({max_frames_out}): \"\n",
" f\"Please increase the value\")\n",
" if ilen + olen > max_frames_inout and max_frames_inout != 0:\n",
" raise ValueError(\n",
" f\"Can't fit one sample in --batch-frames-out ({max_frames_inout}): \"\n",
" f\"Please increase the value\")\n",
" max_olen = max(max_olen, olen)\n",
" max_ilen = max(max_ilen, ilen)\n",
" in_ok = max_ilen * (b + 1) <= max_frames_in or max_frames_in == 0\n",
" out_ok = max_olen * (b + 1) <= max_frames_out or max_frames_out == 0\n",
" inout_ok = (max_ilen + max_olen) * (\n",
" b + 1) <= max_frames_inout or max_frames_inout == 0\n",
" if in_ok and out_ok and inout_ok:\n",
" # add more seq in the minibatch\n",
" b += 1\n",
" else:\n",
" # no more seq in the minibatch\n",
" break\n",
" end = min(length, start + b)\n",
" batch = sorted_data[start:end]\n",
" if shortest_first:\n",
" batch.reverse()\n",
" minibatches.append(batch)\n",
" # Check for min_batch_size and fixes the batches if needed\n",
" i = -1\n",
" while len(minibatches[i]) < min_batch_size:\n",
" missing = min_batch_size - len(minibatches[i])\n",
" if -i == len(minibatches):\n",
" minibatches[i + 1].extend(minibatches[i])\n",
" minibatches = minibatches[1:]\n",
" break\n",
" else:\n",
" minibatches[i].extend(minibatches[i - 1][:missing])\n",
" minibatches[i - 1] = minibatches[i - 1][missing:]\n",
" i -= 1\n",
" start = end\n",
" if num_batches > 0:\n",
" minibatches = minibatches[:num_batches]\n",
" lengths = [len(x) for x in minibatches]\n",
" logger.info(\n",
" str(len(minibatches)) + \" batches containing from \" + str(min(lengths))\n",
" + \" to \" + str(max(lengths)) + \" samples\" + \"(avg \" + str(\n",
" int(np.mean(lengths))) + \" samples).\")\n",
"\n",
" return minibatches\n",
"\n",
"\n",
"def batchfy_shuffle(data, batch_size, min_batch_size, num_batches,\n",
" shortest_first):\n",
" import random\n",
"\n",
" logger.info(\"use shuffled batch.\")\n",
" sorted_data = random.sample(data.items(), len(data.items()))\n",
" logger.info(\"# utts: \" + str(len(sorted_data)))\n",
" # make list of minibatches\n",
" minibatches = []\n",
" start = 0\n",
" while True:\n",
" end = min(len(sorted_data), start + batch_size)\n",
" # check each batch is more than minimum batchsize\n",
" minibatch = sorted_data[start:end]\n",
" if shortest_first:\n",
" minibatch.reverse()\n",
" if len(minibatch) < min_batch_size:\n",
" mod = min_batch_size - len(minibatch) % min_batch_size\n",
" additional_minibatch = [\n",
" sorted_data[i] for i in np.random.randint(0, start, mod)\n",
" ]\n",
" if shortest_first:\n",
" additional_minibatch.reverse()\n",
" minibatch.extend(additional_minibatch)\n",
" minibatches.append(minibatch)\n",
" if end == len(sorted_data):\n",
" break\n",
" start = end\n",
"\n",
" # for debugging\n",
" if num_batches > 0:\n",
" minibatches = minibatches[:num_batches]\n",
" logger.info(\"# minibatches: \" + str(len(minibatches)))\n",
" return minibatches\n",
"\n",
"\n",
"BATCH_COUNT_CHOICES = [\"auto\", \"seq\", \"bin\", \"frame\"]\n",
"BATCH_SORT_KEY_CHOICES = [\"input\", \"output\", \"shuffle\"]\n",
"\n",
"\n",
"def make_batchset(\n",
" data,\n",
" batch_size=0,\n",
" max_length_in=float(\"inf\"),\n",
" max_length_out=float(\"inf\"),\n",
" num_batches=0,\n",
" min_batch_size=1,\n",
" shortest_first=False,\n",
" batch_sort_key=\"input\",\n",
" count=\"auto\",\n",
" batch_bins=0,\n",
" batch_frames_in=0,\n",
" batch_frames_out=0,\n",
" batch_frames_inout=0,\n",
" iaxis=0,\n",
" oaxis=0, ):\n",
" \"\"\"Make batch set from json dictionary\n",
"\n",
" if utts have \"category\" value,\n",
"\n",
" >>> data = {'utt1': {'category': 'A', 'input': ...},\n",
" ... 'utt2': {'category': 'B', 'input': ...},\n",
" ... 'utt3': {'category': 'B', 'input': ...},\n",
" ... 'utt4': {'category': 'A', 'input': ...}}\n",
" >>> make_batchset(data, batchsize=2, ...)\n",
" [[('utt1', ...), ('utt4', ...)], [('utt2', ...), ('utt3': ...)]]\n",
"\n",
" Note that if any utts doesn't have \"category\",\n",
" perform as same as batchfy_by_{count}\n",
"\n",
" :param List[Dict[str, Any]] data: dictionary loaded from data.json\n",
" :param int batch_size: maximum number of sequences in a minibatch.\n",
" :param int batch_bins: maximum number of bins (frames x dim) in a minibatch.\n",
" :param int batch_frames_in: maximum number of input frames in a minibatch.\n",
" :param int batch_frames_out: maximum number of output frames in a minibatch.\n",
" :param int batch_frames_out: maximum number of input+output frames in a minibatch.\n",
" :param str count: strategy to count maximum size of batch.\n",
" For choices, see espnet.asr.batchfy.BATCH_COUNT_CHOICES\n",
"\n",
" :param int max_length_in: maximum length of input to decide adaptive batch size\n",
" :param int max_length_out: maximum length of output to decide adaptive batch size\n",
" :param int num_batches: # number of batches to use (for debug)\n",
" :param int min_batch_size: minimum batch size (for multi-gpu)\n",
" :param bool shortest_first: Sort from batch with shortest samples\n",
" to longest if true, otherwise reverse\n",
" :param str batch_sort_key: how to sort data before creating minibatches\n",
" [\"input\", \"output\", \"shuffle\"]\n",
" :param bool swap_io: if True, use \"input\" as output and \"output\"\n",
" as input in `data` dict\n",
" :param bool mt: if True, use 0-axis of \"output\" as output and 1-axis of \"output\"\n",
" as input in `data` dict\n",
" :param int iaxis: dimension to access input\n",
" (for ASR, TTS iaxis=0, for MT iaxis=\"1\".)\n",
" :param int oaxis: dimension to access output (for ASR, TTS, MT oaxis=0,\n",
" reserved for future research, -1 means all axis.)\n",
" :return: List[List[Tuple[str, dict]]] list of batches\n",
" \"\"\"\n",
"\n",
" # check args\n",
" if count not in BATCH_COUNT_CHOICES:\n",
" raise ValueError(\n",
" f\"arg 'count' ({count}) should be one of {BATCH_COUNT_CHOICES}\")\n",
" if batch_sort_key not in BATCH_SORT_KEY_CHOICES:\n",
" raise ValueError(f\"arg 'batch_sort_key' ({batch_sort_key}) should be \"\n",
" f\"one of {BATCH_SORT_KEY_CHOICES}\")\n",
"\n",
" ikey = \"input\"\n",
" okey = \"output\"\n",
" batch_sort_axis = 0 # index of list \n",
"\n",
" if count == \"auto\":\n",
" if batch_size != 0:\n",
" count = \"seq\"\n",
" elif batch_bins != 0:\n",
" count = \"bin\"\n",
" elif batch_frames_in != 0 or batch_frames_out != 0 or batch_frames_inout != 0:\n",
" count = \"frame\"\n",
" else:\n",
" raise ValueError(\n",
" f\"cannot detect `count` manually set one of {BATCH_COUNT_CHOICES}\"\n",
" )\n",
" logger.info(f\"count is auto detected as {count}\")\n",
"\n",
" if count != \"seq\" and batch_sort_key == \"shuffle\":\n",
" raise ValueError(\n",
" \"batch_sort_key=shuffle is only available if batch_count=seq\")\n",
"\n",
" category2data = {} # Dict[str, dict]\n",
" for v in data:\n",
" k = v['utt']\n",
" category2data.setdefault(v.get(\"category\"), {})[k] = v\n",
"\n",
" batches_list = [] # List[List[List[Tuple[str, dict]]]]\n",
" for d in category2data.values():\n",
" if batch_sort_key == \"shuffle\":\n",
" batches = batchfy_shuffle(d, batch_size, min_batch_size,\n",
" num_batches, shortest_first)\n",
" batches_list.append(batches)\n",
" continue\n",
"\n",
" # sort it by input lengths (long to short)\n",
" sorted_data = sorted(\n",
" d.items(),\n",
" key=lambda data: int(data[1][batch_sort_key][batch_sort_axis][\"shape\"][0]),\n",
" reverse=not shortest_first, )\n",
" logger.info(\"# utts: \" + str(len(sorted_data)))\n",
" \n",
" if count == \"seq\":\n",
" batches = batchfy_by_seq(\n",
" sorted_data,\n",
" batch_size=batch_size,\n",
" max_length_in=max_length_in,\n",
" max_length_out=max_length_out,\n",
" min_batch_size=min_batch_size,\n",
" shortest_first=shortest_first,\n",
" ikey=ikey,\n",
" iaxis=iaxis,\n",
" okey=okey,\n",
" oaxis=oaxis, )\n",
" if count == \"bin\":\n",
" batches = batchfy_by_bin(\n",
" sorted_data,\n",
" batch_bins=batch_bins,\n",
" min_batch_size=min_batch_size,\n",
" shortest_first=shortest_first,\n",
" ikey=ikey,\n",
" okey=okey, )\n",
" if count == \"frame\":\n",
" batches = batchfy_by_frame(\n",
" sorted_data,\n",
" max_frames_in=batch_frames_in,\n",
" max_frames_out=batch_frames_out,\n",
" max_frames_inout=batch_frames_inout,\n",
" min_batch_size=min_batch_size,\n",
" shortest_first=shortest_first,\n",
" ikey=ikey,\n",
" okey=okey, )\n",
" batches_list.append(batches)\n",
"\n",
" if len(batches_list) == 1:\n",
" batches = batches_list[0]\n",
" else:\n",
" # Concat list. This way is faster than \"sum(batch_list, [])\"\n",
" batches = list(itertools.chain(*batches_list))\n",
"\n",
" # for debugging\n",
" if num_batches > 0:\n",
" batches = batches[:num_batches]\n",
" logger.info(\"# minibatches: \" + str(len(batches)))\n",
"\n",
" # batch: List[List[Tuple[str, dict]]]\n",
" return batches\n"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "acquired-hurricane",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO 2021/08/17 04:09:47 <ipython-input-19-4c01301916ec>:284] use shuffled batch.\n",
"[INFO 2021/08/17 04:09:47 <ipython-input-19-4c01301916ec>:286] # utts: 5542\n",
"[INFO 2021/08/17 04:09:47 <ipython-input-19-4c01301916ec>:467] # minibatches: 555\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"555\n"
]
}
],
"source": [
"batch_size=10\n",
"maxlen_in=300\n",
"maxlen_out=400\n",
"minibatches=0 # for debug\n",
"min_batch_size=2\n",
"use_sortagrad=True\n",
"batch_count='seq'\n",
"batch_bins=0\n",
"batch_frames_in=3000\n",
"batch_frames_out=0\n",
"batch_frames_inout=0\n",
" \n",
"dev_data = make_batchset(\n",
" dev_json,\n",
" batch_size,\n",
" maxlen_in,\n",
" maxlen_out,\n",
" minibatches, # for debug\n",
" min_batch_size=min_batch_size,\n",
" shortest_first=use_sortagrad,\n",
" batch_sort_key=\"shuffle\",\n",
" count=batch_count,\n",
" batch_bins=batch_bins,\n",
" batch_frames_in=batch_frames_in,\n",
" batch_frames_out=batch_frames_out,\n",
" batch_frames_inout=batch_frames_inout,\n",
" iaxis=0,\n",
" oaxis=0, )\n",
"print(len(dev_data))\n",
"# for i in range(len(dev_data)):\n",
"# print(len(dev_data[i]))\n"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "warming-malpractice",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting kaldiio\n",
" Downloading kaldiio-2.17.2.tar.gz (24 kB)\n",
"Requirement already satisfied: numpy in ./tools/venv/lib/python3.7/site-packages (from kaldiio) (1.20.1)\n",
"Building wheels for collected packages: kaldiio\n",
" Building wheel for kaldiio (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for kaldiio: filename=kaldiio-2.17.2-py3-none-any.whl size=24469 sha256=aadc8b1a8de5c9769af065ae724fb11326691d2350145019f6e3dba69f020134\n",
" Stored in directory: /root/.cache/pip/wheels/04/07/e8/45641287c59bf6ce41e22259f8680b521c31e6306cb88392ac\n",
"Successfully built kaldiio\n",
"Installing collected packages: kaldiio\n",
"Successfully installed kaldiio-2.17.2\n",
"\u001b[33mWARNING: You are using pip version 20.0.1; however, version 21.2.4 is available.\n",
"You should consider upgrading via the '/workspace/DeepSpeech-2.x/tools/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
]
}
],
"source": [
"!pip install kaldiio"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "equipped-subject",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 54,
"id": "superb-methodology",
"metadata": {},
"outputs": [],
"source": [
"from collections import OrderedDict\n",
"import kaldiio\n",
"\n",
"class LoadInputsAndTargets():\n",
" \"\"\"Create a mini-batch from a list of dicts\n",
"\n",
" >>> batch = [('utt1',\n",
" ... dict(input=[dict(feat='some.ark:123',\n",
" ... filetype='mat',\n",
" ... name='input1',\n",
" ... shape=[100, 80])],\n",
" ... output=[dict(tokenid='1 2 3 4',\n",
" ... name='target1',\n",
" ... shape=[4, 31])]]))\n",
" >>> l = LoadInputsAndTargets()\n",
" >>> feat, target = l(batch)\n",
"\n",
" :param: str mode: Specify the task mode, \"asr\" or \"tts\"\n",
" :param: str preprocess_conf: The path of a json file for pre-processing\n",
" :param: bool load_input: If False, not to load the input data\n",
" :param: bool load_output: If False, not to load the output data\n",
" :param: bool sort_in_input_length: Sort the mini-batch in descending order\n",
" of the input length\n",
" :param: bool use_speaker_embedding: Used for tts mode only\n",
" :param: bool use_second_target: Used for tts mode only\n",
" :param: dict preprocess_args: Set some optional arguments for preprocessing\n",
" :param: Optional[dict] preprocess_args: Used for tts mode only\n",
" \"\"\"\n",
"\n",
" def __init__(\n",
" self,\n",
" mode=\"asr\",\n",
" preprocess_conf=None,\n",
" load_input=True,\n",
" load_output=True,\n",
" sort_in_input_length=True,\n",
" preprocess_args=None,\n",
" keep_all_data_on_mem=False, ):\n",
" self._loaders = {}\n",
"\n",
" if mode not in [\"asr\"]:\n",
" raise ValueError(\"Only asr are allowed: mode={}\".format(mode))\n",
"\n",
" if preprocess_conf is not None:\n",
" self.preprocessing = AugmentationPipeline(preprocess_conf)\n",
" logging.warning(\n",
" \"[Experimental feature] Some preprocessing will be done \"\n",
" \"for the mini-batch creation using {}\".format(\n",
" self.preprocessing))\n",
" else:\n",
" # If conf doesn't exist, this function don't touch anything.\n",
" self.preprocessing = None\n",
"\n",
" self.mode = mode\n",
" self.load_output = load_output\n",
" self.load_input = load_input\n",
" self.sort_in_input_length = sort_in_input_length\n",
" if preprocess_args is None:\n",
" self.preprocess_args = {}\n",
" else:\n",
" assert isinstance(preprocess_args, dict), type(preprocess_args)\n",
" self.preprocess_args = dict(preprocess_args)\n",
"\n",
" self.keep_all_data_on_mem = keep_all_data_on_mem\n",
"\n",
" def __call__(self, batch, return_uttid=False):\n",
" \"\"\"Function to load inputs and targets from list of dicts\n",
"\n",
" :param List[Tuple[str, dict]] batch: list of dict which is subset of\n",
" loaded data.json\n",
" :param bool return_uttid: return utterance ID information for visualization\n",
" :return: list of input token id sequences [(L_1), (L_2), ..., (L_B)]\n",
" :return: list of input feature sequences\n",
" [(T_1, D), (T_2, D), ..., (T_B, D)]\n",
" :rtype: list of float ndarray\n",
" :return: list of target token id sequences [(L_1), (L_2), ..., (L_B)]\n",
" :rtype: list of int ndarray\n",
"\n",
" \"\"\"\n",
" x_feats_dict = OrderedDict() # OrderedDict[str, List[np.ndarray]]\n",
" y_feats_dict = OrderedDict() # OrderedDict[str, List[np.ndarray]]\n",
" uttid_list = [] # List[str]\n",
"\n",
" for uttid, info in batch:\n",
" uttid_list.append(uttid)\n",
"\n",
" if self.load_input:\n",
" # Note(kamo): This for-loop is for multiple inputs\n",
" for idx, inp in enumerate(info[\"input\"]):\n",
" # {\"input\":\n",
" # [{\"feat\": \"some/path.h5:F01_050C0101_PED_REAL\",\n",
" # \"filetype\": \"hdf5\",\n",
" # \"name\": \"input1\", ...}], ...}\n",
" x = self._get_from_loader(\n",
" filepath=inp[\"feat\"],\n",
" filetype=inp.get(\"filetype\", \"mat\"))\n",
" x_feats_dict.setdefault(inp[\"name\"], []).append(x)\n",
"\n",
" if self.load_output:\n",
" for idx, inp in enumerate(info[\"output\"]):\n",
" if \"tokenid\" in inp:\n",
" # ======= Legacy format for output =======\n",
" # {\"output\": [{\"tokenid\": \"1 2 3 4\"}])\n",
" x = np.fromiter(\n",
" map(int, inp[\"tokenid\"].split()), dtype=np.int64)\n",
" else:\n",
" # ======= New format =======\n",
" # {\"input\":\n",
" # [{\"feat\": \"some/path.h5:F01_050C0101_PED_REAL\",\n",
" # \"filetype\": \"hdf5\",\n",
" # \"name\": \"target1\", ...}], ...}\n",
" x = self._get_from_loader(\n",
" filepath=inp[\"feat\"],\n",
" filetype=inp.get(\"filetype\", \"mat\"))\n",
"\n",
" y_feats_dict.setdefault(inp[\"name\"], []).append(x)\n",
"\n",
" if self.mode == \"asr\":\n",
" return_batch, uttid_list = self._create_batch_asr(\n",
" x_feats_dict, y_feats_dict, uttid_list)\n",
" else:\n",
" raise NotImplementedError(self.mode)\n",
"\n",
" if self.preprocessing is not None:\n",
" # Apply pre-processing all input features\n",
" for x_name in return_batch.keys():\n",
" if x_name.startswith(\"input\"):\n",
" return_batch[x_name] = self.preprocessing(\n",
" return_batch[x_name], uttid_list,\n",
" **self.preprocess_args)\n",
"\n",
" if return_uttid:\n",
" return tuple(return_batch.values()), uttid_list\n",
"\n",
" # Doesn't return the names now.\n",
" return tuple(return_batch.values())\n",
"\n",
" def _create_batch_asr(self, x_feats_dict, y_feats_dict, uttid_list):\n",
" \"\"\"Create a OrderedDict for the mini-batch\n",
"\n",
" :param OrderedDict x_feats_dict:\n",
" e.g. {\"input1\": [ndarray, ndarray, ...],\n",
" \"input2\": [ndarray, ndarray, ...]}\n",
" :param OrderedDict y_feats_dict:\n",
" e.g. {\"target1\": [ndarray, ndarray, ...],\n",
" \"target2\": [ndarray, ndarray, ...]}\n",
" :param: List[str] uttid_list:\n",
" Give uttid_list to sort in the same order as the mini-batch\n",
" :return: batch, uttid_list\n",
" :rtype: Tuple[OrderedDict, List[str]]\n",
" \"\"\"\n",
" # handle single-input and multi-input (paralell) asr mode\n",
" xs = list(x_feats_dict.values())\n",
"\n",
" if self.load_output:\n",
" ys = list(y_feats_dict.values())\n",
" assert len(xs[0]) == len(ys[0]), (len(xs[0]), len(ys[0]))\n",
"\n",
" # get index of non-zero length samples\n",
" nonzero_idx = list(\n",
" filter(lambda i: len(ys[0][i]) > 0, range(len(ys[0]))))\n",
" for n in range(1, len(y_feats_dict)):\n",
" nonzero_idx = filter(lambda i: len(ys[n][i]) > 0, nonzero_idx)\n",
" else:\n",
" # Note(kamo): Be careful not to make nonzero_idx to a generator\n",
" nonzero_idx = list(range(len(xs[0])))\n",
"\n",
" if self.sort_in_input_length:\n",
" # sort in input lengths based on the first input\n",
" nonzero_sorted_idx = sorted(\n",
" nonzero_idx, key=lambda i: -len(xs[0][i]))\n",
" else:\n",
" nonzero_sorted_idx = nonzero_idx\n",
"\n",
" if len(nonzero_sorted_idx) != len(xs[0]):\n",
" logging.warning(\n",
" \"Target sequences include empty tokenid (batch {} -> {}).\".\n",
" format(len(xs[0]), len(nonzero_sorted_idx)))\n",
"\n",
" # remove zero-length samples\n",
" xs = [[x[i] for i in nonzero_sorted_idx] for x in xs]\n",
" uttid_list = [uttid_list[i] for i in nonzero_sorted_idx]\n",
"\n",
" x_names = list(x_feats_dict.keys())\n",
" if self.load_output:\n",
" ys = [[y[i] for i in nonzero_sorted_idx] for y in ys]\n",
" y_names = list(y_feats_dict.keys())\n",
"\n",
" # Keeping x_name and y_name, e.g. input1, for future extension\n",
" return_batch = OrderedDict([\n",
" * [(x_name, x) for x_name, x in zip(x_names, xs)],\n",
" * [(y_name, y) for y_name, y in zip(y_names, ys)],\n",
" ])\n",
" else:\n",
" return_batch = OrderedDict(\n",
" [(x_name, x) for x_name, x in zip(x_names, xs)])\n",
" return return_batch, uttid_list\n",
"\n",
" def _get_from_loader(self, filepath, filetype):\n",
" \"\"\"Return ndarray\n",
"\n",
" In order to make the fds to be opened only at the first referring,\n",
" the loader are stored in self._loaders\n",
"\n",
" >>> ndarray = loader.get_from_loader(\n",
" ... 'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5')\n",
"\n",
" :param: str filepath:\n",
" :param: str filetype:\n",
" :return:\n",
" :rtype: np.ndarray\n",
" \"\"\"\n",
" if filetype == \"hdf5\":\n",
" # e.g.\n",
" # {\"input\": [{\"feat\": \"some/path.h5:F01_050C0101_PED_REAL\",\n",
" # \"filetype\": \"hdf5\",\n",
" # -> filepath = \"some/path.h5\", key = \"F01_050C0101_PED_REAL\"\n",
" filepath, key = filepath.split(\":\", 1)\n",
"\n",
" loader = self._loaders.get(filepath)\n",
" if loader is None:\n",
" # To avoid disk access, create loader only for the first time\n",
" loader = h5py.File(filepath, \"r\")\n",
" self._loaders[filepath] = loader\n",
" return loader[key][()]\n",
" elif filetype == \"sound.hdf5\":\n",
" # e.g.\n",
" # {\"input\": [{\"feat\": \"some/path.h5:F01_050C0101_PED_REAL\",\n",
" # \"filetype\": \"sound.hdf5\",\n",
" # -> filepath = \"some/path.h5\", key = \"F01_050C0101_PED_REAL\"\n",
" filepath, key = filepath.split(\":\", 1)\n",
"\n",
" loader = self._loaders.get(filepath)\n",
" if loader is None:\n",
" # To avoid disk access, create loader only for the first time\n",
" loader = SoundHDF5File(filepath, \"r\", dtype=\"int16\")\n",
" self._loaders[filepath] = loader\n",
" array, rate = loader[key]\n",
" return array\n",
" elif filetype == \"sound\":\n",
" # e.g.\n",
" # {\"input\": [{\"feat\": \"some/path.wav\",\n",
" # \"filetype\": \"sound\"},\n",
" # Assume PCM16\n",
" if not self.keep_all_data_on_mem:\n",
" array, _ = soundfile.read(filepath, dtype=\"int16\")\n",
" return array\n",
" if filepath not in self._loaders:\n",
" array, _ = soundfile.read(filepath, dtype=\"int16\")\n",
" self._loaders[filepath] = array\n",
" return self._loaders[filepath]\n",
" elif filetype == \"npz\":\n",
" # e.g.\n",
" # {\"input\": [{\"feat\": \"some/path.npz:F01_050C0101_PED_REAL\",\n",
" # \"filetype\": \"npz\",\n",
" filepath, key = filepath.split(\":\", 1)\n",
"\n",
" loader = self._loaders.get(filepath)\n",
" if loader is None:\n",
" # To avoid disk access, create loader only for the first time\n",
" loader = np.load(filepath)\n",
" self._loaders[filepath] = loader\n",
" return loader[key]\n",
" elif filetype == \"npy\":\n",
" # e.g.\n",
" # {\"input\": [{\"feat\": \"some/path.npy\",\n",
" # \"filetype\": \"npy\"},\n",
" if not self.keep_all_data_on_mem:\n",
" return np.load(filepath)\n",
" if filepath not in self._loaders:\n",
" self._loaders[filepath] = np.load(filepath)\n",
" return self._loaders[filepath]\n",
" elif filetype in [\"mat\", \"vec\"]:\n",
" # e.g.\n",
" # {\"input\": [{\"feat\": \"some/path.ark:123\",\n",
" # \"filetype\": \"mat\"}]},\n",
" # In this case, \"123\" indicates the starting points of the matrix\n",
" # load_mat can load both matrix and vector\n",
" if not self.keep_all_data_on_mem:\n",
" return kaldiio.load_mat(filepath)\n",
" if filepath not in self._loaders:\n",
" self._loaders[filepath] = kaldiio.load_mat(filepath)\n",
" return self._loaders[filepath]\n",
" elif filetype == \"scp\":\n",
" # e.g.\n",
" # {\"input\": [{\"feat\": \"some/path.scp:F01_050C0101_PED_REAL\",\n",
" # \"filetype\": \"scp\",\n",
" filepath, key = filepath.split(\":\", 1)\n",
" loader = self._loaders.get(filepath)\n",
" if loader is None:\n",
" # To avoid disk access, create loader only for the first time\n",
" loader = kaldiio.load_scp(filepath)\n",
" self._loaders[filepath] = loader\n",
" return loader[key]\n",
" else:\n",
" raise NotImplementedError(\n",
" \"Not supported: loader_type={}\".format(filetype))\n"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "monthly-muscle",
"metadata": {},
"outputs": [],
"source": [
"preprocess_conf=None\n",
"train_mode=True\n",
"load = LoadInputsAndTargets(\n",
" mode=\"asr\",\n",
" load_output=True,\n",
" preprocess_conf=preprocess_conf,\n",
" preprocess_args={\"train\":\n",
" train_mode}, # Switch the mode of preprocessing\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "periodic-senegal",
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-56-9f483b231463>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdev_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m<ipython-input-54-9deb677b23d5>\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, batch, return_uttid)\u001b[0m\n\u001b[1;32m 94\u001b[0m x = self._get_from_loader(\n\u001b[1;32m 95\u001b[0m \u001b[0mfilepath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"feat\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 96\u001b[0;31m filetype=inp.get(\"filetype\", \"mat\"))\n\u001b[0m\u001b[1;32m 97\u001b[0m \u001b[0mx_feats_dict\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"name\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-54-9deb677b23d5>\u001b[0m in \u001b[0;36m_get_from_loader\u001b[0;34m(self, filepath, filetype)\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[0;31m# load_mat can load both matrix and vector\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeep_all_data_on_mem\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 280\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mkaldiio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 281\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfilepath\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_loaders\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_loaders\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkaldiio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/kaldiio/matio.py\u001b[0m in \u001b[0;36mload_mat\u001b[0;34m(ark_name, endian, fd_dict)\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_load_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moffset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mslices\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mendian\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mendian\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 239\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 240\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen_like_kaldi\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mark\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfd\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 241\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_load_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moffset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mslices\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mendian\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mendian\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/kaldiio/utils.py\u001b[0m in \u001b[0;36mopen_like_kaldi\u001b[0;34m(name, mode)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[0mencoding\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"b\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mdefault_encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 208\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoding\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 209\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark'"
]
}
],
"source": [
"res = load(dev_data[0])"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "humanitarian-container",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ls: cannot access '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark': No such file or directory\r\n"
]
}
],
"source": [
"!ls /workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "heard-prize",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ls: cannot access '/workspace/espnet/': No such file or directory\r\n"
]
}
],
"source": [
"!ls /workspace/espnet/"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "convinced-animation",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
......@@ -347,7 +347,7 @@ def make_batchset(
Note that if any utts doesn't have "category",
perform as same as batchfy_by_{count}
:param Dict[str, Dict[str, Any]] data: dictionary loaded from data.json
:param List[Dict[str, Any]] data: dictionary loaded from data.json
:param int batch_size: maximum number of sequences in a minibatch.
:param int batch_bins: maximum number of bins (frames x dim) in a minibatch.
:param int batch_frames_in: maximum number of input frames in a minibatch.
......@@ -374,7 +374,6 @@ def make_batchset(
reserved for future research, -1 means all axis.)
:return: List[List[Tuple[str, dict]]] list of batches
"""
# check args
if count not in BATCH_COUNT_CHOICES:
raise ValueError(
......@@ -386,7 +385,6 @@ def make_batchset(
ikey = "input"
okey = "output"
batch_sort_axis = 0 # index of list
if count == "auto":
if batch_size != 0:
count = "seq"
......@@ -405,7 +403,8 @@ def make_batchset(
"batch_sort_key=shuffle is only available if batch_count=seq")
category2data = {} # Dict[str, dict]
for k, v in data.items():
for v in data:
k = v['utt']
category2data.setdefault(v.get("category"), {})[k] = v
batches_list = [] # List[List[List[Tuple[str, dict]]]]
......@@ -422,6 +421,7 @@ def make_batchset(
key=lambda data: int(data[1][batch_sort_key][batch_sort_axis]["shape"][0]),
reverse=not shortest_first, )
logger.info("# utts: " + str(len(sorted_data)))
if count == "seq":
batches = batchfy_by_seq(
sorted_data,
......
......@@ -16,7 +16,7 @@ from typing import Optional
from paddle.io import Dataset
from yacs.config import CfgNode
from deepspeech.frontend.utility import read_manifest
from deepspeech.utils.log import Log
__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册