Source code for forte.datasets.mrc.squad_reader

# Copyright 2019 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json
from typing import Any, Iterator, Dict, Set, Tuple

from forte.data.data_pack import DataPack
from forte.data.base_reader import PackReader
from ft.onto.base_ontology import Document, MRCQuestion, Phrase
from ftx.onto.race_qa import Passage

__all__ = [
    "SquadReader",
]


class SquadReader(PackReader):
    r"""Reader for processing Stanford Question Answering Dataset (SQuAD).

    Stanford Question Answering Dataset (SQuAD) is a reading comprehension
    dataset, consisting of questions posed by crowdworkers on a set of
    Wikipedia articles, where the answer to every question is a segment of
    text, or span.

    Dataset can be downloaded at https://rajpurkar.github.io/SQuAD-explorer/.

    SquadReader turns each paragraph of the dataset into one `DataPack`. The
    pack text is the paragraph context followed by every answerable question
    of that paragraph, each on its own line. The following entries are
    created in the pack:

    - a `Passage` covering the paragraph context,
    - a `Document` covering the whole pack text,
    - one `MRCQuestion` per answerable question,
    - one `Phrase` per answer, marked as a text span inside the context and
      appended to the `answers` list of its `MRCQuestion`.
    """

    def _collect(self, file_path: str) -> Iterator[Any]:  # type: ignore
        r"""Given file_path to the dataset, return an iterator to every data
        point in it.

        Args:
            file_path: path to the JSON file

        Returns: For every paragraph, a tuple of
            ``(article title + paragraph index, list of QA dicts, context)``.
        """
        # `json.load` consumes the whole file, so the handle is released
        # before the first item is yielded instead of staying open for as
        # long as the consumer keeps the generator around.
        with open(file_path, "r", encoding="utf8", errors="ignore") as file:
            dataset = json.load(file)

        for article in dataset["data"]:
            title = article["title"]
            for index, paragraph in enumerate(article["paragraphs"]):
                # The paragraph index makes the name unique within an article.
                yield title + str(index), paragraph["qas"], paragraph["context"]

    def _cache_key_function(self, text_file: str) -> str:
        r"""Return the base name of `text_file`, used as the cache key."""
        return os.path.basename(text_file)

    def _parse_pack(self, qa_dict: Tuple[str, list, str]) -> Iterator[DataPack]:
        r"""Build one `DataPack` from a paragraph produced by `_collect`.

        Args:
            qa_dict: a tuple of ``(pack name, list of QA dicts, context)``.
                Each QA dict must provide ``"question"``, ``"id"`` and
                ``"answers"``; every answer must provide ``"text"`` and
                ``"answer_start"``. The ``"is_impossible"`` flag is optional
                and is treated as ``False`` when it is missing.

        Returns: An iterator yielding the single pack built for the paragraph.
        """
        title, qas, context = qa_dict
        context_end = len(context)
        # Each question is placed after a "\n" separator, hence the `+ 1`.
        offset = context_end + 1
        text_parts = [context]

        pack = DataPack()  # one datapack for a context
        for qa in qas:
            # SQuAD v1.1 entries carry no "is_impossible" key at all, so a
            # plain `qa["is_impossible"]` lookup would raise `KeyError`.
            if qa.get("is_impossible", False) is True:
                continue
            ques_text = qa["question"]
            text_parts.append(ques_text)
            ques_end = offset + len(ques_text)
            question = MRCQuestion(pack, offset, ques_end)
            question.qid = qa["id"]
            offset = ques_end + 1  # skip the separator before the next one
            for ans in qa["answers"]:
                # The context is the prefix of the pack text, so
                # `answer_start` is used as a span begin without any shift.
                ans_start = ans["answer_start"]
                answer = Phrase(pack, ans_start, ans_start + len(ans["text"]))
                question.answers.append(answer)

        # The text is set only once all the kept questions are known.
        pack.set_text("\n".join(text_parts))

        passage = Passage(pack, 0, context_end)
        Document(pack, 0, len(pack.text))

        passage.passage_id = title
        pack.pack_name = title
        yield pack

    @classmethod
    def default_configs(cls):
        r"""Return the default configuration values of this reader.

        Returns: A dictionary that sets `file_ext` to ``".txt"``.
        """
        # NOTE(review): SQuAD is distributed as JSON files and `_collect`
        # parses its input with `json.load`; confirm that ".txt" is intended.
        return {"file_ext": ".txt"}

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Method to add output type record of `SquadReader` which is
        `ft.onto.base_ontology.Document` with an empty set
        to :attr:`forte.data.data_pack.Meta.record`.

        Args:
            record_meta: the field in the datapack for type record that need to
                fill in for consistency checking.
        """
        # NOTE(review): `_parse_pack` also creates `MRCQuestion`, `Phrase` and
        # `Passage` entries, which are not recorded here; confirm whether
        # they should be.
        record_meta["ft.onto.base_ontology.Document"] = set()