Source code for pygacity.generate.answerset

# Author: Cameron F. Abrams, <cfa22@drexel.edu>
"""
A class for handling answer sets in pygacity
"""

import logging
import os
from collections import UserList, UserDict
from typing import Any

import pandas as pd
import yaml

logger = logging.getLogger(__name__)


[docs]
class AnswerSet(UserDict):
    """
    Answer entries for one serial-numbered document, keyed by question index.

    Each key of the underlying dictionary is a question index and each value
    is a list of answer-entry dictionaries with the keys ``label``, ``value``,
    ``units``, ``formatter`` and ``group`` (see :meth:`register`).

    Parameters
    ----------
    data : dict, optional
        initial mapping of question index to list of answer entries
    serial : int, optional
        the serial number of the document these answers belong to (default is 0)
    serialstr : str, optional
        string form of the serial number; defaults to ``str(serial)``

    Attributes
    ----------
    dumpname : str
        name of the file written by :meth:`to_yaml`, ``answers-<serialstr>.yaml``
    first_index : any
        the first question index passed to :meth:`register`, or None
    sources : dict
        mapping of question index to source filename (see :meth:`set_source`)
    """
    _keys = ['label', 'value', 'units', 'formatter']

    def __init__(self, data: dict = None, serial: int = 0, serialstr: str = None):
        self.serial = serial
        self.serialstr = serialstr if serialstr is not None else str(serial)
        self.dumpname = f'answers-{self.serialstr}.yaml'
        self.first_index = None
        self.sources: dict = {}
        # None (rather than a shared ``{}`` default) means "start empty"
        super().__init__(data)

    def set_source(self, index: Any, source: str):
        """Record the source filename for a question index."""
        self.sources[index] = source

    def __len__(self):
        """Return the number of question indices that have registered answers."""
        return len(self.data)

    @classmethod
    def from_yaml(cls, filename: str, delete: bool = False):
        """
        Create an AnswerSet instance by loading from a YAML file of the same format as that
        generated by the to_yaml() method.

        The serial number is parsed from the file's base name, which must look
        like ``answers-<serial#>.yaml`` (or ``.yml``); any leading directories
        are ignored.

        Parameters
        ----------
        filename : str
            the YAML filename to load
        delete : bool, optional
            whether to delete the YAML file after loading (default is False)

        Returns
        -------
        AnswerSet
            An AnswerSet instance populated with data from the YAML file.

        Raises
        ------
        AssertionError
            if the filename does not have the expected extension or format
        ValueError
            if the serial part of the filename is not an integer
        """
        # Only the base name carries the serial; hyphens in directory names
        # must not be mistaken for the "answers-<serial#>" separator.
        root, ext = os.path.splitext(os.path.basename(filename))
        # These checks are raised explicitly (not via ``assert``) so they still
        # run under ``python -O``; AssertionError is kept for existing callers.
        if ext not in ('.yaml', '.yml'):
            raise AssertionError(f'{filename} does not end in .yaml or .yml')
        tokens = root.split('-')
        if len(tokens) != 2:
            raise AssertionError(f'{filename} should be of the format "answers-<serial#>.yaml"')
        serial = int(tokens[1])
        R = cls(serial=serial)
        with open(filename, 'r', encoding='utf-8') as f:
            loaded = yaml.safe_load(f)
        # an empty YAML document loads as None; keep ``data`` a usable mapping
        R.data = loaded if loaded is not None else {}
        if delete:
            os.remove(filename)
        return R

    def register(self, index: Any, label: str = None, value: Any = None,
                 units: str = None, formatter: str = None, group: int = None):
        """
        Register an answer entry for a particular question index.

        Parameters
        ----------
        index : any
            the question index
        label : str, optional
            the label for the answer entry
        value : any, optional
            the value of the answer entry
        units : str, optional
            the units of the answer entry
        formatter : str, optional
            a format string for displaying the value
        group : int, optional
            a group identifier for the answer entry

        Returns
        -------
        tuple
            ``(index, element)`` where ``element`` is the position of the new
            entry within the list for ``index``; this tuple can be passed
            directly to :meth:`display`.
        """
        # compare against None so that a falsy first index (e.g. 0) is kept
        if self.first_index is None:
            self.first_index = index
        entries = self.data.setdefault(index, [])
        # if value is a pint.Quantity, extract magnitude and units
        if hasattr(value, 'magnitude') and hasattr(value, 'units'):
            if units is None:
                units = f'{value.units:~P}'
            value = value.magnitude
        # if value is a numpy data type, convert to native python type
        if hasattr(value, 'item'):
            value = value.item()
        element = len(entries)
        entries.append(dict(label=label,
                            value=value,
                            units=units,
                            formatter=formatter,
                            group=group))
        logger.debug(f'AnswerSet.register index={index} label={label} value={value} units={units} formatter={formatter} group={group}')
        return (index, element)

    def display(self, index: Any, element: int = 0):
        """
        Returns a formatted string for a particular answer entry.

        Parameters
        ----------
        index : any
            the question index, or an ``(index, element)`` tuple as returned
            by register()
        element : int, optional
            the element number within the index (default is 0)

        Returns
        -------
        str
            ``'<label> = <value> <units>'`` when the entry has a label and a
            value, just the label when it has no value, and an empty string
            when the element does not exist or the entry has no label.

        Raises
        ------
        KeyError
            if ``index`` has no registered entries
        """
        if isinstance(index, tuple):
            index, element = index
        entries = self.data[index]
        if element >= len(entries):
            return ''
        entry = entries[element]
        if not entry:
            return ''
        fmt = entry.get('formatter', None)
        val = entry.get('value', None)
        label = entry.get('label', None)
        units = entry.get('units', None)
        vstr = ''
        # Only None and the empty string mean "no value"; numeric zero and
        # False are legitimate answers and must be shown.
        has_value = not (val is None or (isinstance(val, str) and not val))
        if has_value:
            vstr = fmt.format(val) if fmt else str(val)
            if units:
                vstr += f' {units}'
        if not label:
            return ''
        return f'{label} = {vstr}' if vstr else label

    def to_yaml(self):
        """
        Dumps the AnswerSet to a YAML file.

        The file is written to ``self.dumpname`` in the current working
        directory.  Any prefix common to all question indices is stripped
        first, unless stripping would leave one of the indices empty (which
        is always the case when there is only one question).  When a prefix
        is stripped, ``self.data`` itself is replaced by the stripped mapping
        and its keys become strings.
        """
        keys = [str(x) for x in self.data]
        common_prefix = os.path.commonprefix(keys)
        logger.debug(f'AnswerSet.to_yaml common prefix: "{common_prefix}"')
        # If the prefix is itself one of the indices, stripping it would
        # collapse that index to '' -- keep the indices intact instead.
        if common_prefix and common_prefix not in keys:
            self.data = {str(index)[len(common_prefix):]: AL
                         for index, AL in self.data.items()}
        with open(self.dumpname, 'w', encoding='utf-8') as f:
            yaml.safe_dump(self.data, f)




[docs]
class AnswerSuperSet(UserList[AnswerSet]):
    """
    A collection of AnswerSet instances with methods to convert to pandas DataFrames
    and LaTeX tables.

    Parameters
    ----------
    initial : list[AnswerSet], optional
        the AnswerSet instances to collect (default is an empty collection)

    Attributes
    ----------
    questions : dict
        mapping of question index to a dict with keys ``formatters``,
        ``values`` and ``df`` (the pandas DataFrame for that question)
    question_sources : dict
        mapping of question index to source filename, taken from the first
        AnswerSet after sorting by serial
    """
    def __init__(self, initial: list[AnswerSet] = None):
        # None (rather than a shared ``[]`` default) means "start empty"
        super().__init__(initial)
        if not self._check_congruency():
            logger.error('Error: There is a lack of congruency among answer sets')
        self._make_dfs()

    @classmethod
    def from_dumpfiles(cls, files: list[str] = None, delete: bool = False):
        """
        Create an **AnswerSuperSet** instance by loading multiple AnswerSet instances from YAML files.

        Parameters
        ----------
        files : list[str]
            list of YAML filenames to load
        delete : bool, optional
            whether to delete the YAML files after loading (default is False)

        Returns
        -------
        AnswerSuperSet
            An AnswerSuperSet instance populated with AnswerSet instances from the YAML files.
        """
        data = [AnswerSet.from_yaml(f, delete=delete) for f in (files or [])]
        return cls(initial=data)

    def to_latex(self):
        """
        Converts the **AnswerSuperSet** to a LaTeX formatted string.

        One ``longtable`` is emitted per question, preceded by a bold
        ``Question <index>:`` heading that also shows the question's source
        filename when one is known.

        Returns
        -------
        str
            LaTeX formatted string representing the **AnswerSuperSet**
        """
        chunks = []
        for index, qdata in self.questions.items():
            df = qdata['df']
            formatters = qdata.get('formatters', None)
            logger.debug(f'AnswerSuperSet.to_latex question "{index}" with formatters: {formatters}')
            if formatters:
                # Entries store format *strings* (as used by AnswerSet.display);
                # pandas documents ``formatters`` as callables, so hand it the
                # bound ``str.format`` of each string.
                formatters = {col: (fmt.format if isinstance(fmt, str) else fmt)
                              for col, fmt in formatters.items()}
            source = self.question_sources.get(index, '')
            if source:
                # underscores are special in LaTeX text mode
                escaped = source.replace('_', r'\_')
                source_label = f' (\\texttt{{{escaped}}})'
            else:
                source_label = ''
            chunks.append(f'\\noindent\\textbf{{Question {index}:{source_label}}}\\\\\n')
            chunks.append(df.to_latex(formatters=formatters, index=False, longtable=True))
        return ''.join(chunks)

    def _check_congruency(self):
        """
        Checks that all **AnswerSet** instances in the collection have the same indices
        (in the same order) and the same number of answer entries per index.

        Returns
        -------
        bool
            True if all **AnswerSet** instances are congruent (or the
            collection is empty), False otherwise
        """
        if not self.data:
            return True
        reference = self.data[0].data
        ref_indices = list(reference)
        for other in self.data[1:]:
            # Compare whole key lists: a pairwise zip would silently ignore
            # extra or missing indices.
            if list(other.data) != ref_indices:
                return False
            if any(len(other.data[i]) != len(reference[i]) for i in ref_indices):
                return False
        return True

    @staticmethod
    def _column_key(entry: dict) -> str:
        """Return the DataFrame column name for an answer entry: ``label`` or ``label (units)``."""
        label = entry.get('label', None)
        units = entry.get('units', None)
        key = str(label) if label is not None else ''
        if units:
            key += f' ({units})'
        return key

    def _make_dfs(self):
        """
        Constructs one pandas DataFrame per question index.
        The ``group`` field on answer entries is ignored; each question gets its
        own table regardless of grouping.

        The collection is sorted in place by serial, and the first AnswerSet
        after sorting is used as the pattern that defines the questions and
        columns.  The ``data`` of every AnswerSet is replaced by a copy whose
        keys are strings with any common prefix removed.  Results are stored
        in ``self.questions`` and ``self.question_sources``.
        """
        self.questions = {}  # index -> {formatters, values, df}
        self.question_sources = {}
        if not self.data:
            # nothing to tabulate; leave both mappings empty
            return
        self.data.sort(key=lambda x: x.serial)
        serialstrs = [x.serialstr for x in self.data]
        pattern = self.data[0]
        # strip any common prefix from question indices
        pattern_keys = [str(x) for x in pattern.data]
        common_prefix = os.path.commonprefix(pattern_keys)
        if common_prefix in pattern_keys:
            # Stripping would collapse an index to '' (always so for a single
            # question), so keep the indices as they are.
            common_prefix = ''
        skip = len(common_prefix)
        # collect source filenames, applying same prefix stripping as question indices
        self.question_sources = {str(k)[skip:]: v for k, v in pattern.sources.items()}
        logger.debug(f'Overall common prefix: "{common_prefix}"')
        for dataset in self.data:
            dataset.data = {str(index)[skip:]: AL for index, AL in dataset.data.items()}
        # build column structure from the pattern AnswerSet
        for index, AL in pattern.data.items():
            values = {'serials': serialstrs}
            formatters = {}
            for a in AL:
                key = self._column_key(a)
                values[key] = []
                fmt = a.get('formatter', None)
                if fmt:
                    formatters[key] = fmt
            self.questions[index] = dict(formatters=formatters, values=values, df=None)
        # fill values from all AnswerSet instances (one per serial)
        for inst in self.data:
            for index, AL in inst.data.items():
                columns = self.questions[index]['values']
                for a in AL:
                    columns[self._column_key(a)].append(a['value'])
        # build DataFrames
        for index, qdata in self.questions.items():
            logger.debug(f'Building DataFrame for question {index} with values: {qdata["values"]}')
            qdata['df'] = pd.DataFrame(qdata['values'])