Commit bad3fa18 authored by Erxleben, Fredo (FWCC) - 136987's avatar Erxleben, Fredo (FWCC) - 136987 Committed by mdolling-gfz
Browse files

Resolve "Re-enable Access to survey data as pandas data frames"

parent 1013765a
Pipeline #77771 waiting for manual action with stages
in 2 minutes and 36 seconds
......@@ -101,6 +101,10 @@ meta/
hifis-surveyval.yml
preprocess.py
scripts/
output/
# Survey Data
*.csv
# Backups from Meld
*.orig
......@@ -86,6 +86,9 @@ Group your changes into these categories:
- Add URL of project documentation to file `pyproject.toml` for PyPi.org
([!94](https://gitlab.hzdr.de/hifis/surveys/hifis-surveyval/-/merge_requests/94)
by [Normo](https://gitlab.hzdr.de/Normo)).
- Re-enable access to data as pandas DataFrames
([!88](https://gitlab.hzdr.de/hifis/surveys/hifis-surveyval/-/merge_requests/88)
by [erxleb87](https://gitlab.hzdr.de/erxleb87)).
### Changed
- Change Makefile and GitLab CI pipeline
......
......@@ -25,10 +25,10 @@ These can be constructed from YAML through the YamlConstructable abstract
class.
"""
# alias name to avoid clash with schema.Optional
from typing import Dict, List
from typing import Optional as typing_Optional
from typing import Dict, List, Optional
from schema import Optional, Schema
import schema
from pandas import Series
from hifis_surveyval.models.answer_option import AnswerOption
from hifis_surveyval.models.answer_types import VALID_ANSWER_TYPES
......@@ -57,15 +57,15 @@ class Question(YamlConstructable, Identifiable):
token_DATA_TYPE = "datatype"
token_MANDATORY = "mandatory"
schema = Schema(
schema = schema.Schema(
{
token_ID: str,
token_LABEL: str,
token_TEXT: dict,
token_DATA_TYPE: lambda t: t in VALID_ANSWER_TYPES,
token_MANDATORY: bool,
Optional(token_ANSWER_OPTIONS, default=[]): list,
Optional(str): object, # Catchall for unsupported yaml data
schema.Optional(token_ANSWER_OPTIONS, default=[]): list,
schema.Optional(str): object, # Catchall for unsupported yaml data
}
)
......@@ -126,7 +126,7 @@ class Question(YamlConstructable, Identifiable):
# The actual answers are not part of the metadata but have to be read
# from other sources in a separate step
self._answers: Dict[str, typing_Optional[answer_type]] = {}
self._answers: Dict[str, Optional[answer_type]] = {}
def add_answer(self, participant_id: str, value: str):
"""
......@@ -173,6 +173,44 @@ class Question(YamlConstructable, Identifiable):
self._answers[participant_id] = value
@property
def answers(self) -> Dict[str, Optional[object]]:
    """
    Provide the answers that were read from the survey data.

    The result maps each participant ID (a string) to the answer that
    participant gave. Answers may be assumed to be of the answer_type
    of the Question. For questions that are not mandatory, an answer
    may also be None.

    Returns:
        The mapping from participant ID to the participant's answer
        for this question. This is the internally stored dictionary
        itself, not a copy.
    """
    # The value type is only narrowed down to Optional[object]: the
    # return annotation can not refer to self._answer_type.
    # Suggestions for improvement are welcome.
    collected = self._answers
    return collected
def as_series(self) -> Series:
    """
    Build a pandas.Series from the answers to this question.

    Participant IDs form the index of the series and the respective
    answers form its values. The series carries the question's full ID
    as its name.

    Returns:
        A pandas.Series holding one answer per participant.
    """
    return Series(self._answers, name=self.full_id)
@staticmethod
def _from_yaml_dictionary(yaml: YamlDict, **kwargs) -> "Question":
"""
......
......@@ -25,7 +25,10 @@ These can be constructed from YAML through the YamlConstructable abstract
class.
"""
from typing import Dict, List
from typing import Optional as typing_Optional
from typing import Union
from pandas import DataFrame, Series, concat
from schema import Optional, Schema
from hifis_surveyval.models.mixins.identifiable import Identifiable
......@@ -107,6 +110,41 @@ class QuestionCollection(YamlConstructable, Identifiable):
"""
return self._questions[question_short_id]
def as_data_frame(
    self, exclude_labels: typing_Optional[Union[str, List[str]]] = None
) -> DataFrame:
    """
    Gather the answers given to each question as a data frame.

    Args:
        exclude_labels:
            A short label or a list of short labels for questions that
            are to be excluded from the data frame. Any other value
            (including None) excludes nothing.

    Returns:
        A pandas data frame with participants in the rows and the
        questions of this collection in the columns. The fields in
        the data frame then contain the answer to a question for a
        given participant.
        If the collection holds no questions, or all of them are
        excluded, an empty data frame is returned.
    """
    if isinstance(exclude_labels, str):
        excluded = [exclude_labels]
    elif isinstance(exclude_labels, list):
        excluded = list(exclude_labels)
    else:
        # Nothing to exclude in any other case
        excluded = []

    question_series: List[Series] = [
        question.as_series()
        for label, question in self._questions.items()
        if label not in excluded
    ]

    if not question_series:
        # pandas.concat raises a ValueError when it is given nothing to
        # concatenate, so the empty case is answered directly.
        return DataFrame()

    # With axis=1 each series becomes one column (labelled with the
    # series' name) and the series are aligned on their index, so every
    # participant in the row labels is associated with their answer to
    # each question.
    return concat(question_series, axis=1)
@staticmethod
def _from_yaml_dictionary(
yaml: YamlDict, **kwargs
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment