Commit a840df69 authored by Erxleben, Fredo's avatar Erxleben, Fredo
Browse files

Resolve "Enable DataContainer to compose a DataFrame from given IDs"

parent 1bc998e9
Pipeline #85471 passed with stages
in 3 minutes and 4 seconds
......@@ -43,6 +43,7 @@ hifis-surveyval itself. Please see the `Developer Guide
pages/getting_started
pages/preprocessing
pages/accessing_data
pages/development
pages/api
......
.. hifis-surveyval
.. Framework to help developing analysis scripts for the HIFIS Software survey.
..
.. SPDX-FileCopyrightText: 2021 HIFIS Software <support@hifis.net>
..
.. SPDX-License-Identifier: GPL-3.0-or-later
..
.. This program is free software: you can redistribute it and/or modify
.. it under the terms of the GNU General Public License as published by
.. the Free Software Foundation, either version 3 of the License, or
.. (at your option) any later version.
..
.. This program is distributed in the hope that it will be useful,
.. but WITHOUT ANY WARRANTY; without even the implied warranty of
.. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
.. GNU General Public License for more details.
..
.. You should have received a copy of the GNU General Public License
.. along with this program. If not, see <http://www.gnu.org/licenses/>.
Accessing the Data
==================
To work with the survey data, it is most often necessary to access it as certain
data types.
HIFIS Surveyval provides access to the data as the following types:
Pandas DataFrame
----------------
The DataContainer provides an interface for querying the data by question ID.
Wrong or non-existing IDs will be ignored, but logged.
.. code-block:: python
from hifis_surveyval.data_container import DataContainer
from hifis_surveyval.hifis_surveyval import HIFISSurveyval
def run(hifis_surveyval: HIFISSurveyval, data: DataContainer):
    """Execute example script."""
    # Compose and print a data frame for a hand-picked set of
    # question collection IDs.
    selected_ids = ["Q001", "Q002", "Q009"]
    selection_frame = data.data_frame_for_ids(selected_ids)
    hifis_surveyval.printer.print_dataframe(selection_frame)

    # Compose and print a data frame spanning every question collection
    # known to the data container.
    complete_frame = data.data_frame_for_ids(data.question_collection_ids)
    hifis_surveyval.printer.print_dataframe(complete_frame)
......@@ -102,9 +102,10 @@ function of the script. An example script is shown below.
.. code-block:: python
from hifis_surveyval.data_container import DataContainer
from hifis_surveyval.hifis_surveyval import HIFISSurveyval
def run(hifis_surveyval: HIFISSurveyval):
def run(hifis_surveyval: HIFISSurveyval, data: DataContainer):
"""Execute example script."""
for question in hifis_surveyval.survey_questions:
for question in data.question_collection_ids:
print(question)
......@@ -27,10 +27,13 @@ functions.
.. currentmodule:: hifis_surveyval.data_container
.. moduleauthor:: HIFIS Software <software@hifis.net>
"""
import logging
from logging import warning
from typing import Dict, List, Union
import pandas
from pandas import DataFrame
from hifis_surveyval.models.mixins.identifiable import Identifiable
from hifis_surveyval.models.mixins.yaml_constructable import YamlDict, YamlList
from hifis_surveyval.models.question import Question
......@@ -169,11 +172,83 @@ class DataContainer(object):
Returns:
The question for the given ID.
Raises:
KeyError - if either the collection or the question for the given
ID could not be found.
KeyError:
If either the collection or the question for the given ID
could not be found.
"""
parts: List[str] = full_id.split(Identifiable.HIERARCHY_SEPARATOR)
collection_id = parts[0]
question_id = parts[1]
collection = self.collection_for_id(collection_id)
return collection.question_for_id(question_id)
@property
def question_collection_ids(self) -> List[str]:
    """
    List the identifiers of every question collection.

    Returns:
        The IDs of all question collections, as a list of strings.
    """
    # Unpacking the mapping yields its keys in the same order as keys().
    return [*self._survey_questions]
def data_frame_for_ids(self, requested_ids: List[str]) -> DataFrame:
    """
    Compose a data frame from a list of question (collection) IDs.

    IDs for which no question or question collection can be found will
    be skipped. These will be logged at debug level.

    Args:
        requested_ids:
            A list of full question or question collection IDs,
            which are to be composed by participant into a single data
            frame.
    Returns:
        A single data frame containing the answers of all participants
        for the given questions / question collections. The frames of
        the individual IDs are placed side by side as columns. If none
        of the requested IDs could be resolved (or no IDs were
        requested) an empty data frame is returned.
    """
    frame_pieces: List[DataFrame] = []
    for piece_id in requested_ids:
        try:
            frame_pieces.append(self._frame_for_id(piece_id))
        except ValueError as error:
            # Unknown IDs are skipped on purpose; only leave a trace.
            logging.debug(error)

    if not frame_pieces:
        # pandas.concat() raises a ValueError for an empty sequence,
        # which would contradict the "skip unknown IDs" contract.
        return DataFrame()

    # Join column-wise so the pieces line up on their shared index
    # (presumably the participant IDs - confirm against as_data_frame())
    # instead of being stacked as additional rows.
    return pandas.concat(frame_pieces, axis=1)
def _frame_for_id(self, piece_id: str) -> DataFrame:
    """
    Obtain a data frame representation for a Question (Collection) ID.

    This is a helper method used to transform either questions or
    question collections into data frames based on their ID. It is a
    shortcut to be used in data_frame_for_ids() and not meant to be
    called by the user. Use the appropriate functions of questions and
    collections instead.

    Args:
        piece_id:
            The full ID of either a question or question collection.
    Returns:
        A data frame matching the answers given per participant for the
        question or question collection identified by the provided ID.
    Raises:
        ValueError:
            When no Question or QuestionCollection with the given ID
            exists.
    """
    try:
        return self.collection_for_id(piece_id).as_data_frame()
    except KeyError:
        # Not a known collection ID - try it as a full question ID next.
        pass

    try:
        return DataFrame(self.question_for_id(piece_id).as_series())
    except (KeyError, IndexError):
        # IndexError: question_for_id() splits the ID at the hierarchy
        # separator and indexes the second part, which does not exist
        # for an (unknown) ID that contains no separator.
        pass

    raise ValueError(
        f"{piece_id} is not a valid question / collection ID"
    )
......@@ -30,5 +30,15 @@ from hifis_surveyval.hifis_surveyval import HIFISSurveyval
def run(hifis_surveyval: HIFISSurveyval, data: DataContainer):
"""
Execute example script.

Prints the IDs of all question collections known to ``data``, then
prints one data frame composed for a fixed selection of collection IDs
and one composed for all collection IDs.
"""
# NOTE(review): the next two lines look like the pre-change example body
# (lines removed by this commit) - confirm they are not meant to run
# together with the code below.
frame = data.get_by_id()
hifis_surveyval.printer.print_dataframe(frame)
# Print the ID of every loaded question collection.
for question in data.question_collection_ids:
print(question)
# Compose and print a data frame for a selection of question collection
# IDs.
question_collection_ids = ["Q001", "Q002", "Q009"]
dataframe = data.data_frame_for_ids(question_collection_ids)
hifis_surveyval.printer.print_dataframe(dataframe)
# Compose and print a data frame covering all question collections.
dataframe = data.data_frame_for_ids(data.question_collection_ids)
hifis_surveyval.printer.print_dataframe(dataframe)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment