Commit 17b59ac4 authored by mdolling-gfz's avatar mdolling-gfz Committed by Erxleben, Fredo
Browse files

Resolve "additional required features for preprocessing"

parent a840df69
Pipeline #86144 passed with stage
in 1 minute and 39 seconds
......@@ -34,14 +34,39 @@ You can create an empty preprocessing script with the following command:
hifis-surveyval init -p
Within the preprocessing script, you need to have a function called
`preprocessing`, which receives a `DataContainer` Object as argument and is
expected to return a `DataContainer` Object.
``preprocessing(…)``, which receives a ``DataContainer`` Object as argument
and is expected to return a ``DataContainer``-instance.
The data you receive is the parsed raw data from csv. You can do whatever
The data you receive is the parsed raw data from your survey's CSV, enriched with the metadata you provided.
The data you receive is the parsed raw data from your survey's CSV, enriched with the metadata you provided.
You can do whatever you like within the script and modify the data as you wish.
Common use-cases would be to filter the data, like dropping incomplete questionnaires.
If you are concerned about reproducibility it is highly recommended to also publish the preprocessing script along with your raw data and metadata.
All analysis scripts run afterwards will receive a copy of the
`DataContainer` returned by the preprocessing script.
Cleaning Invalid Data
---------------------
A usual task in preprocessing is to remove data that is invalid for any reason.
The ``DataContainer`` keeps track of participant IDs for those participants
that submitted incomplete answer sets.
An answer set is considered incomplete if not all questions that are marked
as `mandatory` have been answered.
You can manually mark the answer sets of participants as invalid by using
the ``mark_answers_invalid(…)``-method of the ``DataContainer``-instance.
This can be used to also exclude data that you found to be unfitting after
manual inspection.
Conversely, there is also a ``mark_answers_valid(…)``-method to manually
override decisions made by the framework.
After you have marked the answer sets accordingly, you can call the
``remove_invalid_answer_sets()``-method to clean up all marked answer sets
across all questions and collections.
Please note that this will change your data container irrevocably.
For fine tuning, the ``QuestionCollection``- and ``Question``-classes also
offer methods to remove answers of selected participants directly.
See also the provided example script for implementation details.
......@@ -29,7 +29,7 @@ functions.
"""
import logging
from logging import warning
from typing import Dict, List, Union
from typing import Dict, List, Set, Union
import pandas
from pandas import DataFrame
......@@ -41,7 +41,15 @@ from hifis_surveyval.models.question_collection import QuestionCollection
class DataContainer(object):
"""The data container holds the data read from the command line."""
"""
The data container holds the data read from the command line.
All data is grouped into question collections, which in turn hold the
questions.
During the loading, the DataContainer will keep track of answer sets
which contradict the validation rules set in the metadata (e.g. no
answer being given despite being mandatory).
"""
#: Name of the ID column in the Limesurvey CSV data
ID_COLUMN_NAME: str = "id"
......@@ -49,6 +57,8 @@ class DataContainer(object):
def __init__(self):
    """Initialize a container that holds no data yet."""
    # Question collections, keyed by their collection ID.
    self._survey_questions: Dict[str, QuestionCollection] = {}
    # IDs of participants whose answer sets were flagged as invalid.
    self._invalid_answer_sets: Set[str] = set()
@property
def survey_questions(self) -> List[QuestionCollection]:
    """
    Obtain all question collections stored in this container.

    Returns:
        A list holding every known question collection.
    """
    all_collections = self._survey_questions.values()
    return list(all_collections)
@property
def invalid_answer_sets(self) -> Set[str]:
    """
    Access the participants whose answer sets are flagged as invalid.

    Returns:
        A set containing the IDs of all participants whose answers
        were marked as invalid.
    """
    return self._invalid_answer_sets
def load_metadata(self, yaml: Union[YamlList, YamlDict]) -> None:
"""
Load additional metadata from YAML data.
......@@ -182,6 +203,39 @@ class DataContainer(object):
collection = self.collection_for_id(collection_id)
return collection.question_for_id(question_id)
def remove_invalid_answer_sets(self) -> None:
    """
    Remove answer sets that were marked as invalid.

    The removal happens on a per-participant basis across every
    question collection held by this container.
    """
    flagged_participants = self._invalid_answer_sets
    for question_collection in self._survey_questions.values():
        question_collection.remove_answers(flagged_participants)
def mark_answers_invalid(self, participant_ids: Set[str]) -> None:
    """
    Flag the answer sets of the given participants as invalid.

    Args:
        participant_ids:
            The IDs of the participants whose answer sets are to be
            flagged as invalid.
    """
    for participant_id in participant_ids:
        self._invalid_answer_sets.add(participant_id)
def mark_answers_valid(self, participant_ids: Set[str]) -> None:
    """
    Flag the answer sets of the given participants as valid.

    NOTE: Answer sets that were already removed are not restored by
    this call. IDs that were never flagged are silently ignored.

    Args:
        participant_ids:
            The IDs of the participants whose answer sets are to be
            flagged as valid.
    """
    for participant_id in participant_ids:
        # discard() tolerates IDs that were never flagged.
        self._invalid_answer_sets.discard(participant_id)
@property
def question_collection_ids(self) -> List[str]:
"""
......
......@@ -28,6 +28,34 @@ from hifis_surveyval.data_container import DataContainer
from hifis_surveyval.hifis_surveyval import HIFISSurveyval
def preprocess(data: DataContainer) -> DataContainer:
    """Preprocess raw data."""
    # Participants whose answers turned out to be unusable after
    # manual inspection of the raw data.
    answers_to_discard = {
        "participant_0",
        "participant_1",
    }
    # Participants whose answers must be kept regardless of any
    # validity decisions made by the framework.
    answers_to_keep = {
        "participant_2",
        "participant_3",
    }
    # Flag the answer sets to be removed or retained.
    data.mark_answers_invalid(answers_to_discard)
    data.mark_answers_valid(answers_to_keep)
    # Show the resulting selection (for reference only).
    print(data.invalid_answer_sets)
    # Drop everything that is still flagged as invalid.
    data.remove_invalid_answer_sets()
    return data
def run(hifis_surveyval: HIFISSurveyval, data: DataContainer):
"""Execute example script."""
# print all loaded question IDs
......
......@@ -25,7 +25,7 @@ These can be constructed from YAML through the YamlConstructable abstract
class.
"""
# alias name to avoid clash with schema.Optional
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Set
import schema
from pandas import Series
......@@ -173,6 +173,18 @@ class Question(YamlConstructable, Identifiable):
self._answers[participant_id] = value
def remove_answers(self, participant_ids: Set[str]) -> None:
    """
    Remove the answers given by the specified participants.

    Args:
        participant_ids:
            The IDs of the participants whose answers are to be removed.
            Invalid IDs are ignored.
    """
    for participant_id in participant_ids:
        # pop() with a default ignores IDs that have no stored answer,
        # honouring the documented contract — a plain `del` would raise
        # KeyError for unknown participant IDs.
        self._answers.pop(participant_id, None)
@property
def answers(self) -> Dict[str, Optional[object]]: # NOTE (0) below
"""
......
......@@ -24,7 +24,7 @@ This module contains classes to represent groups of survey questions.
These can be constructed from YAML through the YamlConstructable abstract
class.
"""
from typing import Dict, List
from typing import Dict, List, Set
from typing import Optional as typing_Optional
from typing import Union
......@@ -110,6 +110,18 @@ class QuestionCollection(YamlConstructable, Identifiable):
"""
return self._questions[question_short_id]
def remove_answers(self, participant_ids: Set[str]) -> None:
    """
    Remove the answers given by the specified participants.

    The removal is delegated to each question in this collection.

    Args:
        participant_ids:
            The IDs of the participants whose answers are to be
            removed. Invalid IDs are ignored.
    """
    for child_question in self._questions.values():
        child_question.remove_answers(participant_ids)
def as_data_frame(
self, exclude_labels: typing_Optional[Union[str, List[str]]] = None
) -> DataFrame:
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment