Commit 61c5cdff authored by Erxleben, Fredo (FWCC) - 136987

Resolve "Adapt models to changed metadata"

parent 454d66b2
Pipeline #75745 passed with stages in 2 minutes and 24 seconds
......@@ -31,6 +31,9 @@ manual dependency config:
- import name: 'hifis_surveyval#noqa'
pkg name: 'hifis_surveyval'
ignore: True
- import name: 'tests#noqa'
pkg name: 'tests'
ignore: True
- import name: 'pkg_resources'
pkg name: 'pkg_resources'
ignore: True
......
......@@ -288,7 +288,6 @@ hifis_surveyval/
├── dispatch.py
├── globals.py
├── __init__.py
├── metadata.py
├── plot.py
├── question.py
└── util.py
......
......@@ -41,6 +41,7 @@ It can be used as a handy facility for running the task from a command line.
.. moduleauthor:: HIFIS Software <software@hifis.net>
"""
import logging
import pathlib
import click
import pkg_resources
......@@ -61,7 +62,7 @@ settings: Settings = Settings()
default=0,
show_default=True,
help="Enable verbose output. "
"Increase verbosity by setting this option up to 3 times.",
"Increase verbosity by setting this option up to 3 times.",
)
def cli(verbose: int) -> None:
"""
......@@ -99,7 +100,7 @@ def version() -> None:
is_flag=True,
show_default=True,
help="Create a default config as file. "
"Overwrites any existing configuration file.",
"Overwrites any existing configuration file.",
)
@click.option(
"--script",
......@@ -107,7 +108,7 @@ def version() -> None:
is_flag=True,
show_default=True,
help="Create an example script in the given script folder. "
"Overwrites any existing example script file.",
"Overwrites any existing example script file.",
)
def init(config: bool, script: bool) -> None:
"""
......@@ -130,26 +131,28 @@ def init(config: bool, script: bool) -> None:
util.create_example_script(settings)
@click.argument("file_name", type=click.File(mode="r"))
@click.argument("survey_data",
type=click.Path(
exists=True,
dir_okay=False,
path_type=pathlib.Path)
)
@cli.command()
def analyze(file_name: click.File) -> None:
def analyze(survey_data: click.Path) -> None:
"""
Read the given files into global data and metadata objects.
Read the survey data and run all defined analysis scripts.
If the data file cannot be parsed by pandas, an error will be printed and
the program will abort.
If the metadata file cannot be parsed, an error will be printed and
the program will abort.
The metadata are read from a file specified in the settings.
Args:
file_name (click.File): File that contains all data for the analysis.
survey_data (click.Path): File that contains all data for the analysis.
"""
settings.load_config_file()
surveyval: HIFISSurveyval = HIFISSurveyval(settings=settings)
surveyval.prepare_environment()
logging.info(f"Analyzing file {file_name.name}")
surveyval.analyze(data_file=file_name)
logging.info(f"Analyzing file {survey_data.name}")
surveyval.load_all_data(data_file=survey_data)
dispatcher: Dispatcher = Dispatcher(surveyval=surveyval)
dispatcher.discover()
......
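For context, a minimal sketch of how the reworked analyze command could be exercised via click's test runner; the import path hifis_surveyval.cli and the file name survey.csv are assumptions, and the file must already exist because the argument is declared with exists=True:

from click.testing import CliRunner

from hifis_surveyval.cli import cli  # assumed import path for the click group

runner = CliRunner()
# The metadata file is no longer passed on the command line; it is taken
# from the settings, so only the survey data path is required.
result = runner.invoke(cli, ["analyze", "survey.csv"])  # hypothetical file
print(result.exit_code, result.output)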
#!/usr/bin/env python
# hifis-surveyval
# Framework to help developing analysis scripts for the HIFIS Software survey.
#
# SPDX-FileCopyrightText: 2021 HIFIS Software <support@hifis.net>
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# -*- coding: utf-8 -*-
"""
This module provides the definitions for survey metadata.
Survey metadata given in a YAML file is transformed into a dictionary.
"""
import logging
from pathlib import Path
from pydoc import locate
from typing import Dict, List, Optional, Union
import numpy
import yaml
from hifis_surveyval.data_container import DataContainer
from hifis_surveyval.models.answer import Answer, AnswerType, ValidAnswerTypes
from hifis_surveyval.models.question import (
AbstractQuestion,
Question,
QuestionCollection,
)
# The YAML dictionary has a recursive type
YamlDict = Dict[str, Optional[Union[str, "YamlDict"]]]
# This would be cooler as an enum
# How to do that in an elegant way with minimal overhead?
KEYWORD_QUESTIONS: str = "questions"
KEYWORD_ANSWERS: str = "answers"
KEYWORD_ID: str = "id"
KEYWORD_TEXT: str = "text"
KEYWORD_SHORT: str = "short-text"
KEYWORD_DATATYPE: str = "datatype"
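# Illustrative metadata layout (not taken from the project; IDs and texts are
# made up): the parser below expects the YAML file to contain a list whose
# entries use the keywords above. An entry with a "questions" key is treated
# as a question collection, everything else as a standalone question.
#
#   - id: Q001
#     text: Which tools do you use?
#     questions:
#       - id: SQ001
#         text: Do you use Git?
#         datatype: bool          # assumed to be listed in ValidAnswerTypes
#         answers:
#           - id: A001
#             text: "Yes"
#             short-text: "yes"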
class MetaDataHandler:
"""Provides functionality to load meta data."""
def __init__(self, data_source: DataContainer) -> None:
"""
Initialize a MetaDataHandler.
Args:
data_source (DataContainer):
Data source is passed in as a dependency.
"""
self.data_source: DataContainer = data_source
self.survey_questions: Dict[str, AbstractQuestion] = {}
@classmethod
def parse_answer(
cls, content: YamlDict, question_data_type: type = str
) -> Answer:
"""
Parse an Answer object from YAML.
Args:
content (YamlDict):
The YAML representation as a dictionary.
question_data_type (type):
The data type of an answer to a question.
Returns:
Answer: A newly constructed Answer object.
"""
assert KEYWORD_ID in content
assert KEYWORD_TEXT in content
answer_id: str = content[KEYWORD_ID]
answer_text: str = content[KEYWORD_TEXT]
answer_short_text: Optional[str] = (
content[KEYWORD_SHORT] if KEYWORD_SHORT in content else None
)
return Answer(
answer_id, answer_text, answer_short_text, question_data_type
)
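# Illustrative call (values are made up):
#   MetaDataHandler.parse_answer({"id": "A001", "text": "Yes", "short-text": "y"}, bool)
# would construct Answer("A001", "Yes", "y", bool).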
def parse_question(
self, content: YamlDict, collection_id: Optional[str] = None
) -> Question:
"""
Parse a Question object from YAML.
Args:
content (YamlDict):
The YAML representation as a dictionary.
collection_id (Optional[str]):
(Optional) If the question is part of a question collection,
this is the ID of the collection as it will be part of the
question ID. Otherwise, just default to None.
Returns:
Question:
A newly constructed Question object. It will
automatically be added to survey_questions.
Raises:
ValueError: Exception thrown if data type of question could not
be parsed.
"""
assert KEYWORD_ID in content
assert KEYWORD_TEXT in content
question_id: str = content.get(KEYWORD_ID)
if collection_id:
question_id = collection_id + "[" + question_id + "]"
question_text: str = content.get(KEYWORD_TEXT)
predefined_answers: List[Answer] = []
# Data types from metadata are given as string.
# They need to be converted to type with pydoc.locate().
# The default data type is string.
question_data_type: type
if KEYWORD_DATATYPE in content:
type_string: str = content[KEYWORD_DATATYPE]
if type_string not in ValidAnswerTypes:
# TODO is there a more robust way to create the filter string?
raise ValueError(
f"Could not parse type name '{type_string}' from metadata "
f"when constructing question {question_id}"
)
question_data_type = locate(type_string)
else:
question_data_type = str
# Check for predefined answers
if KEYWORD_ANSWERS in content and content[KEYWORD_ANSWERS]:
answer_yaml: YamlDict
for answer_yaml in content[KEYWORD_ANSWERS]:
new_answer: Answer = self.parse_answer(
answer_yaml, question_data_type
)
predefined_answers.append(new_answer)
new_question: Question = Question(
question_id, question_text, predefined_answers, question_data_type
)
logging.debug(f"Parsed question {new_question}")
# Put the newly parsed object into the global dictionary
self.survey_questions[question_id] = new_question
return new_question
def parse_question_collection(self, content: YamlDict) -> None:
"""
Parse a Question Collection object from YAML.
Args:
content (YamlDict): The YAML representation as a dictionary.
"""
# TODO handle requirements more gracefully
assert KEYWORD_ID in content
assert KEYWORD_TEXT in content
assert KEYWORD_QUESTIONS in content
collection_id: str = content.get(KEYWORD_ID)
text: str = content.get(KEYWORD_TEXT)
questions: List[Question] = []
for question_yaml in content[KEYWORD_QUESTIONS]:
questions.append(self.parse_question(question_yaml, collection_id))
assert questions
new_collection: QuestionCollection = QuestionCollection(
collection_id, text, questions
)
logging.debug(f"Parsed question collection {new_collection}")
# Put the newly parsed object into the global dictionary
self.survey_questions[collection_id] = new_collection
def construct_questions_from_metadata(
self, metadata_file: Path
) -> Dict[str, AbstractQuestion]:
"""
Load metadata from the given YAML file.
The given YAML file containing the metadata is loaded into a dictionary.
Args:
metadata_file (Path):
Path to the metadata file.
Returns:
Dict[str, AbstractQuestion]:
Dictionary of questions parsed from the metadata file.
Raises:
IOError:
Will be raised if the given YAML file could not be opened and
loaded.
ValueError:
Will be raised if the provided file does not exist.
"""
raw_metadata: YamlDict = {}
if not metadata_file.exists():
raise ValueError("Metadata file did not exist")
try:
with metadata_file.open(mode="r", encoding="utf-8") as file:
raw_metadata = yaml.load(stream=file, Loader=yaml.Loader)
except IOError:
logging.error(f"YAML file {metadata_file} could not be opened.")
raise
if not raw_metadata:  # also covers yaml.load() returning None for an empty file
logging.error(f"File {metadata_file} was empty.")
return {}
item: YamlDict
for item in raw_metadata:
if KEYWORD_QUESTIONS in item:
self.parse_question_collection(item)
else:
self.parse_question(item)
return self.survey_questions
def fetch_participant_answers(self) -> None:
"""
Extract the participants' answers for `survey_questions`.
The function will iterate through the raw pandas frame in the data
container and extract the per-participant answers for each question.
All answers will be stored in the survey_questions dictionary.
No data will be filtered during this operation; all answers are transferred
as-is.
Note: Entries with no data tend to be represented as numpy.nan in
pandas. If the respective column holds boolean or integer values,
there is no valid representation in these data types for NaN. To
preserve clean typing in these columns, the numpy.nan will be replaced
by None.
Raises:
ValueError:
Exception thrown if data source is empty.
ValueError:
Exception thrown if an answer contains no data.
ValueError:
Exception thrown if the data of an answer cannot be cast to
the question's data type.
"""
if self.data_source.empty:
raise ValueError(
"Could not initialize participant answers - "
"data source was empty"
)
for question_id in self.survey_questions:
question: AbstractQuestion = self.survey_questions[question_id]
if question.has_subquestions:
continue # collections have no answers
answers: Dict[
str, AnswerType
] = self.data_source.data_for_question(question_id)
participant_id: str
answer_data: AnswerType
for (participant_id, answer_data) in answers.items():
if answer_data is None:
raise ValueError(
f"Received answer with no data "
f"for question {question_id}, "
f"participant {participant_id}"
)
# Convert the given data to their respective values given the
# target type.
if question.data_type is bool:
try:
if answer_data is not numpy.NaN:
question.add_given_answer(
participant_id, bool(answer_data)
)
else:
# numpy.nan is not a valid bool, replace by None
question.add_given_answer(participant_id, None)
except ValueError:
logging.warning(
f"Could not parse answer to type 'bool' for "
f"question {question.id}, "
f"participant {participant_id}, "
f"answer text '{answer_data}'. "
f"Data entry ignored"
)
elif question.data_type is float:
try:
question.add_given_answer(
participant_id, float(answer_data)
)
except ValueError:
logging.warning(
f"Could not parse answer to type 'float' for "
f"question {question.id}, "
f"participant {participant_id}, "
f"answer text '{answer_data}'. "
f"Data entry ignored"
)
elif question.data_type is int:
try:
if answer_data is not numpy.NaN:
question.add_given_answer(
participant_id, int(answer_data)
)
else:
# numpy.nan is not a valid int, replace by None
question.add_given_answer(participant_id, None)
except ValueError:
logging.warning(
f"Could not parse answer to type 'int' for "
f"question {question.id}, "
f"participant {participant_id}, "
f"answer text '{answer_data}'. "
f"Data entry ignored"
)
else:
# Note: numpy.nan will be stored as "nan", thus they will
# be replaced to allow them to be distinguished from valid
# strings containing the text "nan"
# TODO: Check for hacks/workarounds that filtered "nan"
# strings
if answer_data is not numpy.NaN:
question.add_given_answer(
participant_id, str(answer_data)
)
else:
question.add_given_answer(participant_id, None)
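# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): the handler is
# expected to be driven roughly as follows, assuming a DataContainer that has
# already been populated with the survey data (e.g. via
# HIFISSurveyval.load_all_data) and a hypothetical metadata file path.
#
#   handler = MetaDataHandler(data_source=populated_container)
#   questions = handler.construct_questions_from_metadata(
#       Path("metadata/meta.yml")  # hypothetical path
#   )
#   handler.fetch_participant_answers()
#   # questions now maps question and collection IDs to the parsed objects,
#   # with the participants' answers attached to each Question.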
......@@ -21,115 +21,17 @@
"""This module provides helper functions."""
import shutil
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Any, List, Optional
from pandas import DataFrame, Series, concat
from hifis_surveyval.core.settings import Settings
from hifis_surveyval.models.answer import Answer
from hifis_surveyval.models.question import (
AbstractQuestion,
Question,
QuestionCollection,
)
def filter_and_group(
filter_question: Question, group_question: Question, **filter_args
) -> Dict[Answer, Dict[str, List[Answer]]]:
"""
Obtain filtered results grouped by the answers of a question.
Args:
filter_question (Question):
The question whose given answers are to be filtered.
group_question (Question):
The question according to whose given answers the
participants are grouped.
filter_args:
Arguments passed to filter.
Returns:
Dict[Answer, Dict[str, List[Answer]]]:
An association of answers of group_question to the
filtered answers of filter_question from these participants.
"""
grouped_answers = group_question.grouped_by_answer()
results: Dict[Answer, Dict[str, List[Answer]]] = defaultdict(dict)
for answer, participant_ids in grouped_answers.items():
filter_args["participant_id"] = participant_ids
results[answer] = filter_question.filter_given_answers(**filter_args)
return results
# TODO this can be a member of QuestionCollection itself
def get_free_text_subquestion(
question: QuestionCollection, free_text_question_id: str = "other"
) -> Question:
"""
Get the sub-question of a QuestionCollection that asks for free text answers.
Args:
question (QuestionCollection):
The QuestionCollection in which the sub-question for free text
answers is searched.
free_text_question_id (str):
ID of a question that is of type free text.
Returns:
Question:
A sub-question that asks for custom free text answers.
"""
assert (
question.has_subquestions
), "QuestionCollection should have subquestions, but didn't"
return next(
(
subquestion
for subquestion in question.subquestions
if subquestion.id == f"{question.id}[{free_text_question_id}]"
),
None,
)
def get_given_free_text_answers(
abstract_question: AbstractQuestion,
) -> Dict[str, Answer]:
"""
Obtain valid free text answers of a Question.
Args:
abstract_question (AbstractQuestion):
A Question or QuestionCollection whose free text answers are to
be determined.
Returns:
Dict[str, Answer]:
An association of participant IDs to the free text answers from
these participants. Only participants for which free text answers
were found are included in the results.
"""
if isinstance(abstract_question, QuestionCollection):
question = get_free_text_subquestion(abstract_question)
elif isinstance(abstract_question, Question):
question = abstract_question
else:
return {}
return {
# it is assumed that only one free text answer is given to a question
participant_id: list_of_answers[0]
for participant_id, list_of_answers in question.given_answers.items()
if list_of_answers[0].text != "nan"
}
def dataframe_value_counts(
dataframe: DataFrame, relative_values: bool = False,
drop_nans: bool = True,
) -> DataFrame:
"""
Count how often a unique value appears in each column of a data frame.
......@@ -193,7 +95,7 @@ def cross_reference_sum(data: DataFrame, grouping: Series) -> DataFrame:
the grouping series.
"""
grouping_values: List[Any] = grouping.unique()
grouping_header: str = grouping.name
grouping_header: str = str(grouping.name)
# Join the frame and the series for association and clean N/A values
# Rows that can not be associated get dropped, they will not contribute to
......@@ -222,10 +124,10 @@ def cross_reference_sum(data: DataFrame, grouping: Series) -> DataFrame:
# TODO Remove filter and group for Questions?
def filter_and_group_series(
base_data: Series,
group_by: Series,
min_value: Optional[float] = None,
max_value: Optional[float] = None,
) -> DataFrame:
"""
Filter a series and group its values according to another series.
......@@ -292,5 +194,5 @@ def create_example_script(settings: Settings) -> None:
# copy a file from the package's file payload to the configured scripts folder
shutil.copy(
f"{Path(__file__).parent.parent.absolute()}/files/example_script.py",
settings.SCRIPT_FOLDER,
settings.SCRIPT_FOLDER.resolve(),
)
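A hedged sketch of calling filter_and_group_series, based only on the signature shown above; the import path and the exact inclusive/exclusive semantics of min_value and max_value are assumptions:

from pandas import Series

from hifis_surveyval.core.util import filter_and_group_series  # assumed path

scores = Series({"p01": 4.0, "p02": 1.0, "p03": 5.0})               # made-up answers
groups = Series({"p01": "research", "p02": "industry", "p03": "research"})
# Keep only values of at least 2.0 and group them by the second series.
grouped = filter_and_group_series(scores, groups, min_value=2.0)
print(grouped)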
......@@ -28,115 +28,149 @@ functions.
.. moduleauthor:: HIFIS Software <software@hifis.net>
"""
from typing import Dict
from logging import warning
from typing import List, Union, Dict
from pandas import DataFrame, Series
from hifis_surveyval.models.answer import AnswerType
from hifis_surveyval.models.mixins.identifiable import Identifiable
from hifis_surveyval.models.mixins.yaml_constructable import YamlDict, YamlList
from hifis_surveyval.models.question import Question
from hifis_surveyval.models.question_collection import QuestionCollection
class DataContainer(object):
"""
The data container holds the data read from the command line.