Source code for ms3.parse

import sys, os
import traceback
import pathos.multiprocessing as mp
from collections import Counter, defaultdict
from collections.abc import Collection

import pandas as pd

from .logger import get_logger
from .score import Score
from .utils import group_id_tuples, load_tsv, make_id_tuples, metadata2series, no_collections_no_booleans, pretty_dict, resolve_dir, scan_directory

class Parse:
    """
    Class for storing and manipulating the information from multiple parses
    (i.e. :py:class:`~ms3.score.Score` objects).
    """

    def __init__(self, dir=None, key=None, index=None, file_re=r"\.mscx$", folder_re='.*',
                 exclude_re=r"^(\.|__)", recursive=True, logger_name='Parse', level=None):
        """
        Parameters
        ----------
        dir, key, index, file_re, folder_re, exclude_re, recursive : optional
            Arguments for the method :py:meth:`~ms3.parse.Parse.add_dir`.
            If ``dir`` is not passed, no files are added to the new object.
        logger_name : :obj:`str`, optional
            If you have defined a logger, pass its name. Otherwise, the logger 'Parse' is used.
        level : {'W', 'D', 'I', 'E', 'C', 'WARNING', 'DEBUG', 'INFO', 'ERROR', 'CRITICAL'}, optional
            Pass a level name for which (and above which) you want to see log records.
        """
        self.logger = get_logger(logger_name, level)

        # defaultdicts with keys as keys, each holding a list with file information
        # (therefore accessed via [key][i])
        self.full_paths, self.rel_paths, self.scan_paths, self.paths, self.files, self.fnames, self.fexts = \
            defaultdict(list), defaultdict(list), defaultdict(list), defaultdict(list), \
            defaultdict(list), defaultdict(list), defaultdict(list)

        # dicts that have IDs as keys and are therefore accessed via [(key, i)]
        self._parsed_mscx, self._annotations, self._notelists, self._restlists, self._noterestlists = {}, {}, {}, {}, {}
        self._eventlists, self._labellists, self._chordlists, self._expandedlists, self._index = {}, {}, {}, {}, {}
        self._measurelists, self._parsed_tsv = {}, {}

        # dict with keys as keys, holding the names of the index levels
        # (which have to be the same for all corresponding IDs)
        self._levelnames = {}

        self._lists = {
            'notes': self._notelists,
            'rests': self._restlists,
            'notes_and_rests': self._noterestlists,
            'measures': self._measurelists,
            'events': self._eventlists,
            'labels': self._labellists,
            'chords': self._chordlists,
            'expanded': self._expandedlists,
        }
        self.matches = pd.DataFrame()
        self.last_scanned_dir = dir
        if dir is not None:
            self.add_dir(dir=dir, key=key, index=index, file_re=file_re, folder_re=folder_re,
                         exclude_re=exclude_re, recursive=recursive)

    @property
    def parsed(self):
        """ Returns an overview of the MSCX files that have already been parsed."""
        return {k: score.full_paths['mscx'] for k, score in self._parsed_mscx.items()}
        # res = {}
        # for k, score in self._parsed.items():
        #     info = score.full_paths['mscx']
        #     if 'annotations' in score._annotations:
        #         info += f" -> {score.annotations.n_labels()} labels"
        #     res[k] = info
        # return res

    def add_detached_annotations(self, mscx_key, tsv_key, match_dict=None):
        pass
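
    # Editor's usage sketch (illustrative, not part of the module; the corpus
    # path and key are hypothetical): create a Parse object that scans a
    # directory for MSCX files and prints an overview of what was registered.
    #
    #     p = Parse('~/corpora/beethoven', key='beethoven')
    #     p.info()   # number of files, extensions, parsing status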

    def add_dir(self, dir, key=None, index=None, file_re=r'\.mscx$', folder_re='.*',
                exclude_re=r"^(\.|__)", recursive=True):
        """ Scans the directory ``dir`` for files matching the criteria and registers them.

        Parameters
        ----------
        dir : :obj:`str`
            Directory to scan for files.
        key : :obj:`str`, optional
            | Pass a string to identify the loaded files.
            | By default, the relative sub-directories of ``dir`` are used as keys. For example, for files
              within ``dir`` itself, the key would be ``'.'``, for files in the subfolder ``scores`` it
              would be ``'scores'``, etc.
        index : element or :obj:`Collection` of {'key', 'fname', 'i', :obj:`Collection`}
            | Change this parameter if you want to create particular indices for multi-piece DataFrames.
            | The resulting index must be unique (for identification) and have as many elements as added files.
            | Every single element or Collection of elements ∈ {'key', 'fname', 'i', :obj:`Collection`} stands for an index level.
            | In other words, a single level will result in a single index and a collection of levels will result in a :obj:`~pandas.core.indexes.multi.MultiIndex`.
            | If you pass a Collection that does not start with one of {'key', 'fname', 'i'}, it is interpreted as an index level itself and needs to have at least as many elements as the number of added files.
            | The default ``None`` is equivalent to passing ``(key, i)``, i.e. a MultiIndex of IDs.
            | 'fname' evokes an index level made from file names.
        file_re : :obj:`str`, optional
            Regular expression that file names need to match. Passing 'tsv' or 'csv' is shorthand
            for matching the respective file extension.
        folder_re : :obj:`str`, optional
            Regular expression that folder names need to match.
        exclude_re : :obj:`str`, optional
            Regular expression; matching files and folders are excluded.
        recursive : :obj:`bool`, optional
            Set to False to scan only the top level of ``dir``.
        """
        dir = resolve_dir(dir)
        self.last_scanned_dir = dir
        if file_re in ['tsv', 'csv']:
            file_re = r"\." + file_re + '$'
        res = scan_directory(dir, file_re=file_re, folder_re=folder_re, exclude_re=exclude_re, recursive=recursive)
        ids = [self.handle_path(p, key) for p in res]
        # guard against an empty scan so that the unpacking below cannot fail
        exist = [(i, x) for i, x in enumerate(ids) if x[0] is not None]
        selector, added_ids = zip(*exist) if len(exist) > 0 else ((), ())
        if len(added_ids) > 0:
            grouped_ids = group_id_tuples(ids)
            exts = {k: self.count_extensions(k, i) for k, i in grouped_ids.items()}
            self.logger.debug(f"{len(added_ids)} paths stored.\n{pretty_dict(exts, 'EXTENSIONS')}")
            new_index, level_names = self._treat_index_param(index, ids=added_ids, selector=selector)
            self._index.update(new_index)
            for k in grouped_ids.keys():
                if k in self._levelnames:
                    previous = self._levelnames[k]
                    if previous != level_names:
                        replacement_ids = [(k, i) for i in grouped_ids[k]]
                        if None in previous:
                            new_levels = [level for level in previous if level is not None]
                            if len(new_levels) == 0:
                                new_levels = None
                            replacement_ix, new_levels = self._treat_index_param(new_levels, ids=replacement_ids)
                            self.logger.warning(f"""The created index has different levels ({level_names}) than the index that
already exists for key '{k}': {previous}. Since None stands for a custom level, an alternative index
with levels {new_levels} has been created.""")
                        else:
                            replacement_ix, _ = self._treat_index_param(previous, ids=replacement_ids)
                            self.logger.info(f"""The created index has different levels ({level_names}) than the index that
already exists for key '{k}': {previous}. Therefore, the index for this key has been adapted.""")
                        self._index.update(replacement_ix)
                    else:
                        self.logger.debug(f"Index level names match the existing ones for key '{k}'.")
                else:
                    self._levelnames[k] = level_names
        else:
            self.logger.debug("No files added.")
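
    # Editor's sketch of the ``index`` parameter (hypothetical path and key):
    # passing the collection ('key', 'fname') yields a MultiIndex of
    # key/file-name pairs instead of the default (key, i) IDs.
    #
    #     p = Parse()
    #     p.add_dir('~/corpora/beethoven/scores', key='scores', index=('key', 'fname'))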

    def collect_annotations_objects_references(self, keys=None, ids=None):
        if ids is None:
            ids = list(self._iterids(keys))
        updated = {}
        for id in ids:
            if id in self._parsed_mscx:
                score = self._parsed_mscx[id]
                if 'annotations' in score._annotations:
                    updated[id] = score.annotations
                elif id in self._annotations:
                    del self._annotations[id]
        self._annotations.update(updated)

    def collect_lists(self, keys=None, notes=False, rests=False, notes_and_rests=False, measures=False,
                      events=False, labels=False, chords=False, expanded=False, only_new=True):
        """ Extracts DataFrames from the parsed scores in ``keys`` and stores them in dictionaries.

        Parameters
        ----------
        keys : :obj:`str` or :obj:`Collection`, optional
            Key(s) under which the parsed scores are stored. By default, all keys are used.
        notes, rests, notes_and_rests, measures, events, labels, chords, expanded : :obj:`bool`, optional
            Set to True to extract the corresponding type of DataFrame.
        only_new : :obj:`bool`, optional
            Set to False to also retrieve lists that have already been retrieved.

        Returns
        -------
        None
        """
        if len(self._parsed_mscx) == 0:
            self.logger.error("No scores have been parsed so far. Use parse_mscx()")
            return
        keys = self._treat_key_param(keys)
        scores = {id: score for id, score in self._parsed_mscx.items() if id[0] in keys}
        bool_params = list(self._lists.keys())
        l = locals()
        params = {p: l[p] for p in bool_params}
        for id, score in scores.items():
            for param, li in self._lists.items():
                if params[param] and (id not in li or not only_new):
                    df = score.mscx.__getattribute__(param)
                    if df is not None:
                        li[id] = df
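
    # Editor's sketch (hypothetical object ``p`` holding parsed scores):
    # extract note and measure lists into the internal dictionaries;
    # get_lists() further below is the public way to retrieve them afterwards.
    #
    #     p.parse_mscx()
    #     p.collect_lists(notes=True, measures=True)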

    def count_annotation_layers(self, keys=None, per_key=False, detached=False):
        """ Counts the annotation layers of all annotated scores.

        Parameters
        ----------
        keys : :obj:`str` or :obj:`Collection`, defaults to None
            Key(s) for which to count annotation layers.
        per_key : :obj:`bool`, optional
            If set to True, the results are returned as a dict {key: Series}, otherwise the counts
            are summed up in one Series.
        detached : :obj:`bool`, optional
            Set to True in order to count layers in annotations that are currently not attached to a score.

        Returns
        -------
        :obj:`dict` or :obj:`pandas.Series`
            The counted annotation layers, indexed by (staff, voice, label_type).
        """
        res_dict = defaultdict(Counter)
        if detached:
            for id in self._iterids(keys):
                if id in self._parsed_mscx:
                    for key, annotations in self._parsed_mscx[id]._annotations.items():
                        if key != 'annotations':
                            _, layers = annotations.annotation_layers
                            res_dict[key].update(layers.to_dict())
        else:
            for key, i in self._iterids(keys):
                if (key, i) in self._annotations:
                    _, layers = self._annotations[(key, i)].annotation_layers
                    res_dict[key].update(layers.to_dict())

        def make_series(counts):
            if len(counts) == 0:
                return pd.Series()
            data = counts.values()
            ix = pd.Index(counts.keys(), names=['staff', 'voice', 'label_type'])
            return pd.Series(data, ix)

        if per_key:
            res = {k: make_series(v) for k, v in res_dict.items()}
        else:
            res = make_series(sum(res_dict.values(), Counter()))
        if len(res) == 0:
            self.logger.info("No annotations found. Maybe no scores have been parsed using parse_mscx()?")
        return res
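
    # Editor's sketch (hypothetical ``p``): count annotation layers over all
    # keys, or per key; the result is a Series (or a dict of Series) indexed
    # by (staff, voice, label_type).
    #
    #     p.count_annotation_layers()              # one summary Series
    #     p.count_annotation_layers(per_key=True)  # {key: Series}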

    def count_extensions(self, keys=None, per_key=False):
        """ Counts the file extensions of the registered files.

        Parameters
        ----------
        keys : :obj:`str` or :obj:`Collection`, defaults to None
            Key(s) for which to count file extensions.
        per_key : :obj:`bool`, optional
            If set to True, the results are returned as a dict {key: {extension: count}}, otherwise
            the counts are summed up in one dict.

        Returns
        -------
        :obj:`dict`
            A dict of counts, or a dict of such dicts if ``per_key`` is set to True.
        """
        keys = self._treat_key_param(keys)
        res_dict = {}
        for key in keys:
            res_dict[key] = Counter(self.fexts[key])
        if per_key:
            return {k: dict(v) for k, v in res_dict.items()}
        return dict(sum(res_dict.values(), Counter()))
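
    # Editor's sketch (hypothetical ``p``): counting extensions yields plain
    # dicts such as {'.mscx': 12, '.tsv': 12}.
    #
    #     p.count_extensions()              # summed over all keys
    #     p.count_extensions(per_key=True)  # {key: {extension: count}}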

    def count_label_types(self, keys=None, per_key=False):
        keys = self._treat_key_param(keys)
        annotated = [id for id in self._iterids(keys) if id in self._annotations]
        res_dict = defaultdict(Counter)
        for key, i in annotated:
            res_dict[key].update(self._annotations[(key, i)].label_types)
        if len(res_dict) == 0:
            if len(self._parsed_mscx) == 0:
                self.logger.error("No scores have been parsed so far. Use parse_mscx().")
            else:
                self.logger.info("None of the scores contain annotations.")
        if per_key:
            return {k: dict(v) for k, v in res_dict.items()}
        return dict(sum(res_dict.values(), Counter()))

    def detach_labels(self, keys=None, annotation_key='detached', staff=None, voice=None, label_type=None, delete=True):
        assert annotation_key != 'annotations', "The key 'annotations' is reserved, please choose a different one."
        ids = [id for id in self._iterids(keys) if id in self._annotations]
        prev_logger = self.logger
        for id in ids:
            score = self._parsed_mscx[id]
            self.logger = score.logger
            try:
                score.detach_labels(key=annotation_key, staff=staff, voice=voice, label_type=label_type, delete=delete)
            except Exception:
                self.logger.error(f"Detaching labels failed with the following error:\n{sys.exc_info()[1]}")
            finally:
                self.logger = prev_logger
        self.collect_annotations_objects_references(ids=ids)

    def get_labels(self, keys=None, staff=None, voice=None, label_type=None, positioning=True, decode=False):
        if len(self._parsed_mscx) == 0:
            self.logger.error("No scores have been parsed so far. Use parse_mscx()")
            return pd.DataFrame()
        keys = self._treat_key_param(keys)
        label_type = self._treat_label_type_param(label_type)
        self.collect_lists(labels=True, only_new=True)
        l = locals()
        params = {p: l[p] for p in ['staff', 'voice', 'label_type', 'positioning', 'decode']}
        ids = [id for id in self._iterids(keys) if id in self._annotations]
        annotation_tables = [self._annotations[id].get_labels(**params, warnings=False) for id in ids]
        idx, names = self.ids2idx(ids)
        names += tuple(annotation_tables[0].index.names)
        return pd.concat(annotation_tables, keys=idx, names=names)

    def get_lists(self, keys=None, notes=False, rests=False, notes_and_rests=False, measures=False,
                  events=False, labels=False, chords=False, expanded=False):
        if len(self._parsed_mscx) == 0:
            self.logger.error("No scores have been parsed so far. Use parse_mscx()")
            return {}
        keys = self._treat_key_param(keys)
        bool_params = list(self._lists.keys())
        l = locals()
        params = {p: l[p] for p in bool_params}
        self.collect_lists(keys, only_new=True, **params)
        ids = list(self._iterids(keys))
        res = {}
        for param, li in self._lists.items():
            if params[param]:
                for id in ids:
                    if id in li:
                        res[id + (param,)] = li[id]
        return res

    def handle_path(self, full_path, key=None):
        full_path = resolve_dir(full_path)
        if os.path.isfile(full_path):
            file_path, file = os.path.split(full_path)
            file_name, file_ext = os.path.splitext(file)
            rel_path = os.path.relpath(file_path, self.last_scanned_dir)
            if key is None:
                key = rel_path
            if file in self.files[key]:
                same_name = [i for i, f in enumerate(self.files[key]) if f == file]
                if any(True for i in same_name if self.rel_paths[key][i] == rel_path):
                    self.logger.error(f"""The file name {file} is already registered for key '{key}' and both files have the relative path {rel_path}.
Load one of the identically named files with a different key using add_dir(key='KEY').""")
                    return (None, None)
                self.logger.debug(f"The file {file} is already registered for key '{key}' but can be distinguished via the relative path.")
            self.scan_paths[key].append(self.last_scanned_dir)
            self.rel_paths[key].append(rel_path)
            self.full_paths[key].append(full_path)
            self.paths[key].append(file_path)
            self.files[key].append(file)
            self.fnames[key].append(file_name)
            self.fexts[key].append(file_ext)
            return key, len(self.paths[key]) - 1
        else:
            self.logger.error("No file found at this path: " + full_path)
            return (None, None)

    def ids2idx(self, ids, pandas_index=False):
        idx = [self._index[id] for id in ids]
        levels = [len(ix) for ix in idx]
        error = False
        if not all(l == levels[0] for l in levels[1:]):
            self.logger.warning(f"Could not create index because the index values have different numbers of levels: {set(levels)}")
            idx = ids
            error = True
        if error:
            names = ['key', 'i']
        else:
            grouped_ids = group_id_tuples(ids)
            level_names = {k: self._levelnames[k] for k in grouped_ids}
            if len(set(level_names.values())) > 1:
                self.logger.warning(f"Could not set level names because they differ for the different keys:\n{pretty_dict(level_names, 'LEVEL_NAMES')}")
                names = None
            else:
                names = tuple(level_names.values())[0]
        if pandas_index:
            idx = pd.Index(idx, names=names)
            return idx
        return idx, names

    def index(self, keys=None, per_key=False):
        if per_key:
            keys = self._treat_key_param(keys)
            return {k: self.index(k) for k in keys}
        return [self._index[id] for id in self._iterids(keys)]

    def info(self, keys=None, return_str=False):
        ids = list(self._iterids(keys))
        info = f"{len(ids)} files.\n"
        exts = self.count_extensions(keys, per_key=True)
        info += pretty_dict(exts, heading='EXTENSIONS')
        parsed_ids = [id for id in ids if id in self._parsed_mscx]
        parsed = len(parsed_ids)
        if parsed > 0:
            mscx = self.count_extensions(keys, per_key=False)['.mscx']
            if parsed == mscx:
                info += f"\n\nAll {mscx} MSCX files have been parsed."
            else:
                info += f"\n\n{parsed}/{mscx} MSCX files have been parsed."
            annotated = sum(True for id in ids if id in self._annotations)
            info += f"\n{annotated} of them have annotations attached."
            if annotated > 0:
                layers = self.count_annotation_layers(keys, per_key=True)
                info += f"\n{pretty_dict(layers, heading='ANNOTATION LAYERS')}"
            detached = sum(True for id in parsed_ids if self._parsed_mscx[id].has_detached_annotations)
            if detached > 0:
                info += f"\n{detached} of them have detached annotations:"
                layers = self.count_annotation_layers(keys, per_key=True, detached=True)
                info += f"\n{pretty_dict(layers, heading='ANNOTATION LAYERS')}"
        else:
            info += "\n\nNo mscx files have been parsed."
        if return_str:
            return info
        print(info)

    def metadata(self, keys=None):
        parsed_ids = [id for id in self._iterids(keys) if id in self._parsed_mscx]
        if len(parsed_ids) > 0:
            ids, meta_series = zip(*[(id, metadata2series(self._parsed_mscx[id].mscx.metadata)) for id in parsed_ids])
            idx = self.ids2idx(ids, pandas_index=True)
            return pd.DataFrame(meta_series, index=idx)
        if len(self._parsed_mscx) == 0:
            self.logger.info("No scores have been parsed so far. Use parse_mscx()")
        return pd.DataFrame()
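
    # Editor's sketch (hypothetical ``p`` with parsed, annotated scores): the
    # keys of get_lists() results are (key, i, list_type) tuples, while
    # get_labels() and metadata() return single concatenated DataFrames.
    #
    #     tables = p.get_lists(notes=True, expanded=True)
    #     labels = p.get_labels(staff=2)
    #     meta = p.metadata()   # one row of metadata per parsed score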

    def parse(self, keys=None, read_only=True, level=None, parallel=True, only_new=True, fexts=None, **kwargs):
        """ Shorthand for executing parse_mscx and parse_tsv in one go."""
        self.parse_mscx(keys=keys, read_only=read_only, level=level, parallel=parallel, only_new=only_new)
        self.parse_tsv(keys=keys, fexts=fexts, **kwargs)
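
    # Editor's sketch (hypothetical ``p``): parse() simply chains the two
    # specialized methods, so this is equivalent to calling them separately.
    #
    #     p.parse(keys='scores', parallel=False)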

    def parse_mscx(self, keys=None, read_only=True, level=None, parallel=True, only_new=True):
        if parallel and not read_only:
            read_only = True
            self.logger.info("When pieces are parsed in parallel, the resulting objects are always in read_only mode.")
        if only_new:
            paths = [(key, i, self.full_paths[key][i]) for key, i in self._iterids(keys)
                     if self.fexts[key][i] == '.mscx' and (key, i) not in self._parsed_mscx]
        else:
            paths = [(key, i, self.full_paths[key][i]) for key, i in self._iterids(keys)
                     if self.fexts[key][i] == '.mscx']
        parse_this = [(key, i, path, read_only, level) for key, i, path in paths]
        ids = [t[:2] for t in parse_this]
        if parallel:
            pool = mp.Pool(mp.cpu_count())
            res = pool.starmap(self._parse, parse_this)
            pool.close()
            pool.join()
            self._parsed_mscx.update({id: score for id, score in zip(ids, res)})
        else:
            for params in parse_this:
                self._parsed_mscx[params[:2]] = self._parse(*params)
        self.collect_annotations_objects_references(ids=ids)

    def parse_tsv(self, keys=None, fexts=None, **kwargs):
        if fexts is None:
            ids = [(key, i) for key, i in self._iterids(keys) if self.fexts[key][i] != '.mscx']
        else:
            if isinstance(fexts, str):
                fexts = [fexts]
            fexts = [ext if ext[0] == '.' else f".{ext}" for ext in fexts]
            ids = [(key, i) for key, i in self._iterids(keys) if self.fexts[key][i] in fexts]
        for key, i in ids:
            self._parsed_tsv[(key, i)] = load_tsv(self.full_paths[key][i], **kwargs)

    def store_lists(self, keys=None, root_dir=None,
                    notes_folder=None, notes_suffix='',
                    rests_folder=None, rests_suffix='',
                    notes_and_rests_folder=None, notes_and_rests_suffix='',
                    measures_folder=None, measures_suffix='',
                    events_folder=None, events_suffix='',
                    labels_folder=None, labels_suffix='',
                    chords_folder=None, chords_suffix='',
                    expanded_folder=None, expanded_suffix='',
                    simulate=False):
        l = locals()
        list_types = list(self._lists)
        folder_vars = [t + '_folder' for t in list_types]
        suffix_vars = [t + '_suffix' for t in list_types]
        folder_params = {t: l[p] for t, p in zip(list_types, folder_vars) if l[p] is not None}
        suffix_params = {t: l[p] for t, p in zip(list_types, suffix_vars) if t in folder_params}
        list_params = {p: True for p in folder_params.keys()}
        lists = self.get_lists(keys, **list_params)
        paths = {}
        for (key, i, what), li in lists.items():
            new_path = self._store_tsv(df=li, key=key, i=i, folder=folder_params[what], suffix=suffix_params[what],
                                       root_dir=root_dir, what=what, simulate=simulate)
            if new_path in paths:
                modus = 'would have' if simulate else 'has'
                self.logger.warning(f"The {paths[new_path]} at {new_path} {modus} been overwritten with {what}.")
            paths[new_path] = what
        if simulate:
            return list(set(paths.keys()))
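
    # Editor's sketch (hypothetical folder names): with simulate=True,
    # store_lists() only computes and returns the paths it would write to.
    #
    #     p.parse_mscx()
    #     p.store_lists(notes_folder='../notes', measures_folder='../measures',
    #                   simulate=True)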

    def store_mscx(self, keys=None, root_dir=None, folder='.', suffix='', simulate=False):
        """ Stores the parsed MuseScore files in their current state, e.g. after detaching or attaching annotations."""
        ids = [id for id in self._iterids(keys) if id in self._parsed_mscx]
        paths = []
        for key, i in ids:
            new_path = self._store_mscx(key=key, i=i, folder=folder, suffix=suffix, root_dir=root_dir, simulate=simulate)
            if new_path in paths:
                modus = 'would have' if simulate else 'has'
                self.logger.warning(f"The score at {new_path} {modus} been overwritten.")
            else:
                paths.append(new_path)
        if simulate:
            return list(set(paths))
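
    # Editor's sketch (hypothetical suffix): write the (possibly modified)
    # scores next to the originals, marked with a suffix.
    #
    #     p.store_mscx(folder='.', suffix='_reviewed', simulate=True)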

    def _calculate_path(self, key, i, root_dir, folder):
        """ Constructs a path and file name from a loaded file based on the arguments.

        Parameters
        ----------
        key, i : (:obj:`str`, :obj:`int`)
            ID from which to construct the new path and filename.
        folder : :obj:`str`
            Where to store the file. Can be relative to ``root_dir`` or absolute, in which case
            ``root_dir`` is ignored. If ``folder`` is relative, the behaviour depends on whether it
            starts with a dot ``.`` or not: If it does, the folder is created at every end point of
            the relative tree structure under ``root_dir``. If it doesn't, it is created only once,
            relative to ``root_dir``, and the relative tree structure is built below.
        root_dir : :obj:`str`, optional
            Defaults to None, meaning that the original directory from which the file was scanned is
            used. Otherwise, pass a directory to rebuild the original substructure. If ``folder`` is
            an absolute path, ``root_dir`` is ignored.
        """
        if os.path.isabs(folder) or '~' in folder:
            folder = resolve_dir(folder)
            path = folder
        else:
            root = self.scan_paths[key][i] if root_dir is None else resolve_dir(root_dir)
            if folder[0] == '.':
                path = os.path.abspath(os.path.join(root, self.rel_paths[key][i], folder))
            else:
                path = os.path.abspath(os.path.join(root, folder, self.rel_paths[key][i]))
            base, _ = os.path.split(root)
            if path[:len(base)] != base:
                self.logger.error(f"Not allowed to store files above the level of root {root}.\nErroneous path: {path}")
                return None
        return path

    def _iterids(self, keys=None):
        """ Iterator through IDs for a given set of keys.

        Yields
        ------
        :obj:`tuple`
            (str, int)
        """
        keys = self._treat_key_param(keys)
        for key in sorted(keys):
            for id in make_id_tuples(key, len(self.fnames[key])):
                yield id

    def _itersel(self, collectio, selector, opposite=False):
        if selector is None:
            for i, e in enumerate(collectio):
                yield e
        elif opposite:
            for i, e in enumerate(collectio):
                if i not in selector:
                    yield e
        else:
            for i, e in enumerate(collectio):
                if i in selector:
                    yield e

    def _parse(self, key, i, path=None, read_only=False, level=None):
        if path is None:
            path = self.full_paths[key][i]
        file = self.files[key][i]
        fname = self.fnames[key][i]
        prev_logger = self.logger.name
        self.logger = get_logger(fname)
        self.logger.debug(f"Attempting to parse {file}")
        try:
            score = Score(path, read_only=read_only, logger_name=self.logger.name, level=level)
            self._parsed_mscx[(key, i)] = score
            self.logger.info(f"Done parsing {file}")
            return score
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            self.logger.exception(traceback.format_exc())
            return None
        finally:
            self.logger = get_logger(prev_logger)

    def _store_mscx(self, key, i, folder, suffix='', root_dir=None, simulate=False):
        """ Creates a MuseScore 3 file from the Score object at the given ID (key, i).

        Parameters
        ----------
        key, i : (:obj:`str`, :obj:`int`)
            ID from which to construct the new path and filename.
        folder, root_dir : :obj:`str`
            Parameters passed to :py:meth:`_calculate_path`.
        suffix : :obj:`str`, optional
            Suffix to append to the original file name.
        simulate : :obj:`bool`, optional
            Set to True if no files are to be written.

        Returns
        -------
        :obj:`str`
            Path of the stored file.
        """

        def restore_logger(val):
            nonlocal prev_logger
            self.logger = prev_logger
            return val

        prev_logger = self.logger
        fname = self.fnames[key][i]
        self.logger = get_logger(fname)
        id = (key, i)
        if id not in self._parsed_mscx:
            self.logger.error("No Score object found. Call parse_mscx() first.")
            return restore_logger(None)
        path = self._calculate_path(key=key, i=i, root_dir=root_dir, folder=folder)
        if path is None:
            return restore_logger(None)
        fname = fname + suffix + '.mscx'
        file_path = os.path.join(path, fname)
        if simulate:
            self.logger.debug(f"Would have written score to {file_path}.")
        else:
            os.makedirs(path, exist_ok=True)
            self._parsed_mscx[id].store_mscx(file_path)
            self.logger.debug(f"Score written to {file_path}.")
        return restore_logger(file_path)

    def _store_tsv(self, df, key, i, folder, suffix='', root_dir=None, what='DataFrame', simulate=False, **kwargs):
        """ Stores a given DataFrame by constructing path and file name from a loaded file based on the arguments.

        Parameters
        ----------
        df : :obj:`pandas.DataFrame`
            DataFrame to store as a TSV.
        key, i : (:obj:`str`, :obj:`int`)
            ID from which to construct the new path and filename.
        folder, root_dir : :obj:`str`
            Parameters passed to :py:meth:`_calculate_path`.
        suffix : :obj:`str`, optional
            Suffix to append to the original file name.
        what : :obj:`str`, optional
            Descriptor of what the DataFrame contains, for a more informative log message.
        simulate : :obj:`bool`, optional
            Set to True if no files are to be written.
        **kwargs:
            Arguments for :py:meth:`pandas.DataFrame.to_csv`. Defaults to ``{'sep': '\t', 'index': False}``.
            If 'sep' is changed to a different separator, the file extension(s) will be changed to
            '.csv' rather than '.tsv'.

        Returns
        -------
        :obj:`str`
            Path of the stored file.
        """

        def restore_logger(val):
            nonlocal prev_logger
            self.logger = prev_logger
            return val

        prev_logger = self.logger
        fname = self.fnames[key][i]
        self.logger = get_logger(fname + f":{what}")
        if df is None:
            self.logger.debug(f"No DataFrame for {what}.")
            return restore_logger(None)
        path = self._calculate_path(key=key, i=i, root_dir=root_dir, folder=folder)
        if path is None:
            return restore_logger(None)
        if 'sep' not in kwargs:
            kwargs['sep'] = '\t'
        if 'index' not in kwargs:
            kwargs['index'] = False
        ext = '.tsv' if kwargs['sep'] == '\t' else '.csv'
        fname = fname + suffix + ext
        file_path = os.path.join(path, fname)
        if simulate:
            self.logger.debug(f"Would have written {what} to {file_path}.")
        else:
            os.makedirs(path, exist_ok=True)
            no_collections_no_booleans(df, logger=self.logger).to_csv(file_path, **kwargs)
            self.logger.debug(f"{what} written to {file_path}.")
        return restore_logger(file_path)

    def _treat_index_param(self, index_param, ids, selector=None):
        if index_param is None:
            names = ('key', 'i')
            return {id: id for id in ids}, names
        if isinstance(index_param, str):
            index_param = [index_param]
        index_levels = []
        is_index_level = False
        names = []
        for i, level in enumerate(index_param):
            if isinstance(level, str):
                if level in ['key', 'fname', 'i']:
                    new_level = self._make_index_level(level, ids=ids, selector=selector)
                    index_levels.append(new_level)
                    names.append(level)
                    self.logger.debug(f"Level '{level}' generated: {new_level}")
                else:
                    assert len(index_levels) == 0, f"Failed to create index level '{level}', because it is neither a keyword nor a Collection."
                    is_index_level = True
                    break
            elif isinstance(level, Collection):
                new_level = self._make_index_level(level, ids=ids, selector=selector)
                if len(new_level) > 0:
                    index_levels.append(new_level)
                    names.append(None)
            else:
                assert len(index_levels) == 0, f"Failed to create index level '{level}', because it is neither a keyword nor a Collection."
                is_index_level = True
                break
        if is_index_level:
            self.logger.debug("index_param is interpreted as a single index level rather than a collection of levels.")
            new_level = self._make_index_level(index_param, ids=ids, selector=selector)
            if len(new_level) > 0:
                index_levels.append(new_level)
                names = [None]
        if len(index_levels) == 0:
            self.logger.error("No index could be created.")
        new_index = {id: ix for id, ix in zip(ids, zip(*[tuple(v.values()) for v in index_levels]))}
        existing = [ix for ix in new_index if ix in self._index.keys()]
        counts = {k: v for k, v in Counter(new_index.values()).items() if v > 1}
        l_counts, l_existing = len(counts), len(existing)
        if l_counts > 0 or l_existing > 0:
            new_index = self._treat_index_param(None, ids=ids)
            if l_counts > 0:
                plural_phrase = "These values occur" if l_counts > 1 else "This value occurs"
                self.logger.error(f"The generated index is not unique and has been replaced by the standard index (IDs).\n{plural_phrase} several times:\n{pretty_dict(counts)}")
            if l_existing > 0:
                plural_phrase = "s are" if l_existing > 1 else " is"
                self.logger.error(f"The generated index cannot be used because the following element{plural_phrase} already in use:\n{existing}")
        return new_index, tuple(names)

    def _make_index_level(self, level, ids, selector=None):
        if level == 'key':
            return {id: id[0] for id in ids}
        if level == 'i':
            return {id: id[1] for id in ids}
        if level == 'fname':
            return {(key, i): self.fnames[key][i] for key, i in ids}
        ll, li = len(level), len(ids)
        ls = 0 if selector is None else len(selector)
        if ll < li:
            self.logger.error(f"Index level (length {ll}) does not have enough values for {li} ids.")
            return {}
        if ll > li:
            if ls == 0:
                res = {i: l for i, l in self._itersel(zip(ids, level), tuple(range(li)))}
                discarded = [l for l in self._itersel(level, tuple(range(li, ll)))]
                self.logger.warning(f"""Index level (length {ll}) has more values than needed for {li} ids and no selector has been passed.
Using the first {li} elements, discarding {discarded}.""")
            elif ls != li:
                self.logger.error(f"The selector for picking elements from the overlong index level (length {ll}) should have length {li}, not {ls}.")
                res = {}
            else:
                if ls != ll:
                    discarded = [l for l in self._itersel(level, selector, opposite=True)]
                    plural_s = 's' if len(discarded) > 1 else ''
                    self.logger.debug(f"Selector {selector} was applied, leaving out the index value{plural_s} {discarded}")
                res = {i: l for i, l in zip(ids, self._itersel(level, selector))}
        else:
            res = {i: l for i, l in zip(ids, level)}
        return res

    def _treat_key_param(self, keys):
        if keys is None:
            keys = list(self.full_paths.keys())
        elif isinstance(keys, str):
            keys = [keys]
        return keys

    def _treat_label_type_param(self, label_type):
        if label_type is None:
            return None
        all_types = {str(k): k for k in self.count_label_types().keys()}
        if isinstance(label_type, int) or isinstance(label_type, str):
            label_type = [label_type]
        lt = [str(t) for t in label_type]
        not_found = [t for t in lt if t not in all_types]
        if len(not_found) > 0:
            plural = len(not_found) > 1
            plural_s = 's' if plural else ''
            self.logger.warning(
                f"No labels found with {'these' if plural else 'this'} label_type{plural_s}: {', '.join(not_found)}")
        return [all_types[t] for t in lt if t in all_types]

    # def expand_labels(self, keys=None, how='dcml'):
    #     keys = self._treat_key_param(keys)
    #     scores = {id: score for id, score in self._parsed.items() if id[0] in keys}
    #     res = {}
    #     for id, score in scores.items():
    #         if score.mscx._annotations is not None:
    #             exp = score.annotations.expanded
    #             self._expandedlists[id] = exp
    #             res[id + ('expanded',)] = exp
    #     return res

    def __getitem__(self, item):
        if item in self.fexts:
            fexts = self.fexts[item]
            for i, ext in enumerate(fexts):
                id = (item, i)
                ix = str(self._index[id])
                if ext == '.mscx':
                    if id in self._parsed_mscx:
                        ix += " (parsed)"
                        val = str(self._parsed_mscx[id])
                    else:
                        ix += " (not parsed)"
                        val = self.full_paths[item][i]
                else:
                    if id in self._parsed_tsv:
                        ix += " (parsed)"
                        val = self._parsed_tsv[id].head(5).to_string()
                    else:
                        ix += " (not parsed)"
                        val = self.full_paths[item][i]
                ix += f"\n{'-' * len(ix)}\n\n"
                print(ix, val, '\n')

    def __repr__(self):
        return self.info(return_str=True)
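
# Editor's usage sketch (hypothetical key): indexing a Parse object with a key
# prints an overview of that key's files and their parsing status, and repr()
# delegates to info().
#
#     p['scores']   # prints one block per file, marked (parsed)/(not parsed)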