# coding=utf-8
# Copyright 2021 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""Librispeech automatic speech recognition dataset."""

#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os

import datasets
from datasets.tasks import AutomaticSpeechRecognition


_CITATION = """\
 | 
						|
@inproceedings{panayotov2015librispeech,
 | 
						|
  title={Librispeech: an ASR corpus based on public domain audio books},
 | 
						|
  author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
 | 
						|
  booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},
 | 
						|
  pages={5206--5210},
 | 
						|
  year={2015},
 | 
						|
  organization={IEEE}
 | 
						|
}
 | 
						|
"""
 | 
						|
 | 
						|
_DESCRIPTION = """\
 | 
						|
LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,
 | 
						|
prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read
 | 
						|
audiobooks from the LibriVox project, and has been carefully segmented and aligned.87
 | 
						|
"""
 | 
						|
 | 
						|
_URL = "http://www.openslr.org/12"
 | 
						|
#_DL_URL = "http://www.openslr.org/resources/12/"
 | 
						|
_DL_URL = "./librispeech/"
 | 
						|
 | 
						|
 | 
						|
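# NOTE: `_DL_URL` has been redirected from the OpenSLR mirror (kept above as a comment)
# to a local "./librispeech/" directory, so the archives named in `_DL_URLS` below are
# assumed to have been downloaded beforehand and placed next to this script.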
_DL_URLS = {
    "clean": {
        "dev": _DL_URL + "dev-clean.tar.gz",
        "test": _DL_URL + "test-clean.tar.gz",
        "train.100": _DL_URL + "train-clean-100.tar.gz",
        "train.360": _DL_URL + "train-clean-360.tar.gz",
    },
    "other": {
        "test": _DL_URL + "test-other.tar.gz",
        "dev": _DL_URL + "dev-other.tar.gz",
        "train.500": _DL_URL + "train-other-500.tar.gz",
    },
    "all": {
        "dev.clean": _DL_URL + "dev-clean.tar.gz",
        "dev.other": _DL_URL + "dev-other.tar.gz",
        "test.clean": _DL_URL + "test-clean.tar.gz",
        "test.other": _DL_URL + "test-other.tar.gz",
        # "train.clean.100": _DL_URL + "train-clean-100.tar.gz",
        # "train.clean.360": _DL_URL + "train-clean-360.tar.gz",
        # "train.other.500": _DL_URL + "train-other-500.tar.gz",
    },
}


class LibrispeechASRConfig(datasets.BuilderConfig):
    """BuilderConfig for LibriSpeechASR."""

    def __init__(self, **kwargs):
        """
        Args:
          data_dir: `string`, the path to the folder containing the files in the
            downloaded .tar
          citation: `string`, citation for the data set
          url: `string`, url for information about the data set
          **kwargs: keyword arguments forwarded to super.
        """
        super(LibrispeechASRConfig, self).__init__(version=datasets.Version("2.1.0", ""), **kwargs)


class LibrispeechASR(datasets.GeneratorBasedBuilder):
    """Librispeech dataset."""

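    # Each example carries the raw FLAC bytes, so keep the Arrow writer batch small to
    # bound memory use while the dataset is being written to disk.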
    DEFAULT_WRITER_BATCH_SIZE = 256
    DEFAULT_CONFIG_NAME = "all"
    BUILDER_CONFIGS = [
        LibrispeechASRConfig(name="clean", description="'Clean' speech."),
        LibrispeechASRConfig(name="other", description="'Other', more challenging, speech."),
        LibrispeechASRConfig(name="all", description="Combined clean and other dataset."),
    ]

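    # With the default "all" config, only the dev and test archives are fetched; the
    # combined train archives are commented out both in `_DL_URLS` and in
    # `_split_generators` below.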
    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "text": datasets.Value("string"),
                    "speaker_id": datasets.Value("int64"),
                    "chapter_id": datasets.Value("int64"),
                    "id": datasets.Value("string"),
                }
            ),
            supervised_keys=("file", "text"),
            homepage=_URL,
            citation=_CITATION,
            task_templates=[AutomaticSpeechRecognition(audio_column="audio", transcription_column="text")],
        )

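    # For orientation, a decoded row from this builder looks roughly like the sketch
    # below (values are placeholders, not real corpus entries); the `Audio` feature
    # turns the stored FLAC bytes into an array on access:
    #     {
    #         "file": "<speaker>-<chapter>-<utterance>.flac",
    #         "audio": {"path": "...", "array": <float array>, "sampling_rate": 16000},
    #         "text": "<TRANSCRIPT>",
    #         "speaker_id": <speaker>,
    #         "chapter_id": <chapter>,
    #         "id": "<speaker>-<chapter>-<utterance>",
    #     }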
    def _split_generators(self, dl_manager):
        archive_path = dl_manager.download(_DL_URLS[self.config.name])
        # (Optional) In non-streaming mode, we can extract the archive locally to have actual local audio files:
        local_extracted_archive = dl_manager.extract(archive_path) if not dl_manager.is_streaming else {}

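        # Each split pairs a streaming-friendly iterator over the tar archive with the
        # path of the locally extracted copy (when one exists), which
        # `_generate_examples` uses to build local file paths.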
        if self.config.name == "clean":
            train_splits = [
                datasets.SplitGenerator(
                    name="train.100",
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("train.100"),
                        "files": dl_manager.iter_archive(archive_path["train.100"]),
                    },
                ),
                datasets.SplitGenerator(
                    name="train.360",
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("train.360"),
                        "files": dl_manager.iter_archive(archive_path["train.360"]),
                    },
                ),
            ]
            dev_splits = [
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("dev"),
                        "files": dl_manager.iter_archive(archive_path["dev"]),
                    },
                )
            ]
            test_splits = [
                datasets.SplitGenerator(
                    name=datasets.Split.TEST,
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("test"),
                        "files": dl_manager.iter_archive(archive_path["test"]),
                    },
                )
            ]
        elif self.config.name == "other":
            train_splits = [
                datasets.SplitGenerator(
                    name="train.500",
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("train.500"),
                        "files": dl_manager.iter_archive(archive_path["train.500"]),
                    },
                )
            ]
            dev_splits = [
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("dev"),
                        "files": dl_manager.iter_archive(archive_path["dev"]),
                    },
                )
            ]
            test_splits = [
                datasets.SplitGenerator(
                    name=datasets.Split.TEST,
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("test"),
                        "files": dl_manager.iter_archive(archive_path["test"]),
                    },
                )
            ]
        elif self.config.name == "all":
            # The combined training archives are disabled below, so return an empty list
            # of train splits; otherwise the final `return` would hit an undefined name.
            train_splits = []
            # train_splits = [
            #     datasets.SplitGenerator(
            #         name="train.clean.100",
            #         gen_kwargs={
            #             "local_extracted_archive": local_extracted_archive.get("train.clean.100"),
            #             "files": dl_manager.iter_archive(archive_path["train.clean.100"]),
            #         },
            #     ),
            #     datasets.SplitGenerator(
            #         name="train.clean.360",
            #         gen_kwargs={
            #             "local_extracted_archive": local_extracted_archive.get("train.clean.360"),
            #             "files": dl_manager.iter_archive(archive_path["train.clean.360"]),
            #         },
            #     ),
            #     datasets.SplitGenerator(
            #         name="train.other.500",
            #         gen_kwargs={
            #             "local_extracted_archive": local_extracted_archive.get("train.other.500"),
            #             "files": dl_manager.iter_archive(archive_path["train.other.500"]),
            #         },
            #     ),
            # ]
            dev_splits = [
                datasets.SplitGenerator(
                    name="validation.clean",
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("dev.clean"),
                        "files": dl_manager.iter_archive(archive_path["dev.clean"]),
                    },
                ),
                datasets.SplitGenerator(
                    name="validation.other",
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("dev.other"),
                        "files": dl_manager.iter_archive(archive_path["dev.other"]),
                    },
                ),
            ]
            test_splits = [
                datasets.SplitGenerator(
                    name="test.clean",
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("test.clean"),
                        "files": dl_manager.iter_archive(archive_path["test.clean"]),
                    },
                ),
                datasets.SplitGenerator(
                    name="test.other",
                    gen_kwargs={
                        "local_extracted_archive": local_extracted_archive.get("test.other"),
                        "files": dl_manager.iter_archive(archive_path["test.other"]),
                    },
                ),
            ]

        return train_splits + dev_splits + test_splits

    def _generate_examples(self, files, local_extracted_archive):
        """Generate examples from a LibriSpeech archive_path."""
        key = 0
        audio_data = {}
        transcripts = []
        for path, f in files:
            if path.endswith(".flac"):
                id_ = path.split("/")[-1][: -len(".flac")]
                audio_data[id_] = f.read()
            elif path.endswith(".trans.txt"):
                for line in f:
                    if line:
                        line = line.decode("utf-8").strip()
                        id_, transcript = line.split(" ", 1)
                        audio_file = f"{id_}.flac"
                        speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]]
                        audio_file = (
                            os.path.join(local_extracted_archive, audio_file)
                            if local_extracted_archive
                            else audio_file
                        )
                        transcripts.append(
                            {
                                "id": id_,
                                "speaker_id": speaker_id,
                                "chapter_id": chapter_id,
                                "file": audio_file,
                                "text": transcript,
                            }
                        )
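            # Flush the buffered chapter once every collected .flac has a matching
            # transcript line; this relies on each chapter directory in the archive
            # storing its audio files alongside a single *.trans.txt file.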
            if audio_data and len(audio_data) == len(transcripts):
                for transcript in transcripts:
                    audio = {"path": transcript["file"], "bytes": audio_data[transcript["id"]]}
                    yield key, {"audio": audio, **transcript}
                    key += 1
                audio_data = {}
                transcripts = []
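

# Minimal local smoke test (a sketch, assuming the archives listed in `_DL_URLS` are
# already present under "./librispeech/"): load the "clean" config through this script
# and print one validation example.
if __name__ == "__main__":
    ds = datasets.load_dataset(__file__, "clean", split="validation")
    print(ds[0]["id"], ds[0]["text"])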