# Source code for niftynet.utilities.util_csv

# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division

import csv
import os
import sys
from collections import Counter
from difflib import SequenceMatcher

import numpy as np

from niftynet.io.misc_io import touch_folder
from niftynet.utilities.filename_matching import KeywordsMatching


def match_first_degree(name_list1, name_list2):
    """
    First immediate matching between two possible name lists (exact equality
    between one item of list1 and one item of list2).

    Each element of the two inputs is itself a list of candidate names. For
    every element, the first candidate that also appears anywhere in the
    other input is kept as the match. When a name appears several times in
    the other input, the element holding its first occurrence is used.

    :param name_list1: First list of lists of names to match
    :param name_list2: Second list of lists of names where to find a match
    :return init_match1: Matched name for each item of list 1 ('' if none)
    :return init_match2: Matched name for each item of list 2 ('' if none)
    :return ind_match1: Indices of second list that correspond to each given
    item of list 1 if exists (-1 otherwise)
    :return ind_match2: Indices of first list that correspond to each given
    item of list 2 if exists (-1 otherwise)
    If either input is None, four None values are returned.
    """
    if name_list1 is None or name_list2 is None:
        return None, None, None, None
    # One dictionary per side replaces the repeated linear scans
    # (``in`` / ``index``) over the flattened lists.
    owners_in_list1 = _first_owner_by_name(name_list1)
    owners_in_list2 = _first_owner_by_name(name_list2)
    init_match1, ind_match1 = _exact_matches(name_list1, owners_in_list2)
    init_match2, ind_match2 = _exact_matches(name_list2, owners_in_list1)
    return init_match1, init_match2, ind_match1, ind_match2


def _first_owner_by_name(name_lists):
    """Map every name to the index of the first sublist that contains it."""
    owners = {}
    for position, names in enumerate(name_lists):
        for name in names:
            # setdefault keeps the first occurrence, like ``list.index``.
            owners.setdefault(name, position)
    return owners


def _exact_matches(name_lists, owners_in_other):
    """Return, per sublist, its first name known to the other side and the
    index of the sublist owning it there ('' and -1 when there is none)."""
    matched_names = [''] * len(name_lists)
    matched_indices = [-1] * len(name_lists)
    for position, names in enumerate(name_lists):
        for name in names:
            if name in owners_in_other:
                matched_names[position] = name
                matched_indices[position] = owners_in_other[name]
                break
    return matched_names, matched_indices


def __find_max_overlap_in_list(name, list_names):
    """
    Look for the candidate in list_names sharing the longest common
    substring with name.

    A candidate replaces the current best only when both its overlap length
    and the fraction of the candidate covered by the overlap are at least as
    large as those of the current best. The result is discarded when more
    than one candidate contains the selected substring with that same
    coverage fraction (ambiguous match).

    :param name: string to match to any of list_names
    :param list_names: list of candidate strings
    :return match_seq: matched substring ('' if there is no usable match)
    :return index: index of element in list_names to which the match is
    associated. Returns -1 if there is no found match
    """
    if not list_names:
        return '', -1

    best_size = 0
    best_ratio = 0
    best_substring = ''
    best_candidate = ''
    for candidate in list_names:
        # Empty candidates cannot overlap with anything.
        if not candidate:
            continue
        overlap = SequenceMatcher(None, name, candidate).find_longest_match(
            0, len(name), 0, len(candidate))
        coverage = overlap.size / len(candidate)
        if overlap.size < best_size or coverage < best_ratio:
            continue
        best_size = overlap.size
        best_ratio = coverage
        best_substring = candidate[overlap.b:overlap.b + overlap.size]
        best_candidate = candidate

    if best_size == 0:
        return '', -1

    # Count the candidates that would explain the overlap equally well.
    equivalent = 0
    for candidate in list_names:
        if best_substring in candidate \
                and best_size / len(candidate) == best_ratio:
            equivalent += 1
    if equivalent > 1:
        return '', -1
    return best_substring, list_names.index(best_candidate)


def match_second_degree(name_list1, name_list2):
    """
    Perform the double matching between two lists of
    possible names.
    First find the direct matches (see match_first_degree), then match the
    items of name_list1 that are still unmatched against the names of the
    still unmatched items of name_list2 using the maximum overlap.

    Only the results for name_list1 are returned, so the overlap search is
    run in that direction only.

    When several names of one item find an overlap, the last one wins; a
    name without overlap never erases a match found by an earlier name.

    :param name_list1: list of lists of candidate names
    :param name_list2: list of lists of candidate names to match against
    :return init_match1: Matching string for each item of name_list1
    ('' if no match was found)
    :return ind_match1: Index of corresponding match in name_list2
    (-1 if no match was found)
    If either input is None, (None, None) is returned so that the result
    always unpacks into two values.
    """
    if name_list1 is None or name_list2 is None:
        return None, None
    init_match1, init_match2, ind_match1, _ = match_first_degree(
        name_list1, name_list2)

    # Flatten the names of the list2 items left unmatched by the exact
    # matching, remembering which item each flattened name belongs to.
    remaining_names = []
    remaining_owner = []
    for position, names in enumerate(name_list2):
        if init_match2[position] != '':
            continue
        for candidate in names:
            remaining_names.append(candidate)
            remaining_owner.append(position)

    for position, names in enumerate(name_list1):
        if init_match1[position] != '':
            continue
        for candidate in names:
            overlap, index = __find_max_overlap_in_list(
                candidate, remaining_names)
            if index >= 0:
                # Update name and index together so they stay consistent.
                init_match1[position] = overlap
                ind_match1[position] = remaining_owner[index]
    return init_match1, ind_match1


# From a list of list of names and a list of list of files that are
# associated, find the name correspondence and therefore the files associations
def join_subject_id_and_filename_list(name_list, list_files):
    """
    From the list of list of names and the list of list of files
    corresponding to each constraint find the association between a single
    name id and the different file lists.

    The constraint with the largest number of name entries is used as the
    reference; every constraint (including the reference itself) is matched
    against it with match_second_degree.

    :param name_list: list of list of names (one list per constraint)
    :param list_files: list of list of files (one list per constraint)
    :return list_combined: List per subject of name and list of files given
    by the constraints; a file entry is '' when the constraint has no match
    for that subject. Empty list when name_list is empty.
    """
    if not name_list:
        # Nothing to combine; np.argmax would raise on an empty sequence.
        return []
    ind_max = np.argmax([len(names) for names in name_list])
    name_max = name_list[ind_max]
    ind_tot = []
    name_max_to_use = []
    for c in range(len(list_files)):
        name_match, ind_match = match_second_degree(name_max, name_list[c])
        if c == ind_max:
            # Names of the reference constraint matched against itself
            # become the subject identifiers of the output rows.
            name_max_to_use = name_match
        ind_tot.append(ind_match)

    list_combined = []
    for i, name in enumerate(name_max_to_use):
        list_temp = [name]
        # To do : Taking care of the case when the list of a constraint is
        # completely empty
        for files, indices in zip(list_files, ind_tot):
            list_temp.append(files[indices[i]] if indices[i] > -1 else '')
        list_combined.append(list_temp)
    return list_combined


def remove_duplicated_names(name_list):
    """
    From a list of list of names remove the items that are duplicated.

    A name is considered duplicated when it occurs more than once across
    all sublists taken together; every occurrence of such a name is dropped.

    :param name_list: list of list of names to investigate
    :return duplicates_removed: list of list of names freed of duplicates
    """
    # Single counting pass instead of one ``list.count`` scan per name.
    occurrences = Counter(name for names in name_list for name in names)
    duplicated = {name for name, count in occurrences.items() if count > 1}
    return [[name for name in names if name not in duplicated]
            for names in name_list]


def write_csv(csv_file, list_combined):
    """
    Write rows to a comma-separated file, overwriting any existing content.

    :param csv_file: path of the file to write
    :param list_combined: iterable of rows, each row being an iterable of
    fields
    :return:
    """
    # csv writer has different behaviour in python 2/3: python 3 expects a
    # text file opened with newline='', python 2 expects a binary file.
    if sys.version_info[0] >= 3:
        csvfile = open(csv_file, 'w', newline='', encoding='utf8')
    else:
        csvfile = open(csv_file, 'wb')
    with csvfile:
        csv.writer(csvfile, delimiter=',').writerows(list_combined)


def match_and_write_filenames_to_csv(list_constraints, csv_file):
    """
    Combine all elements of file searching until finally writing the names.

    :param list_constraints: list of constraints (defined by list of paths to
    search, list of elements the filename should contain and of those that
    are forbidden)
    :param csv_file: file on which to write the final list of files.
    :return list_combined: the rows written to csv_file (subject name
    followed by one file per constraint), or None when list_constraints is
    None or empty
    :raises IOError: if no row is left once the rows containing an empty
    entry have been discarded
    """
    if not list_constraints:
        return None
    name_tot = []
    list_tot = []
    for constraint in list_constraints:
        list_files, name_list = \
            KeywordsMatching.matching_subjects_and_filenames(constraint)
        name_tot.append(remove_duplicated_names(name_list))
        list_tot.append(list_files)
    list_combined = join_subject_id_and_filename_list(name_tot, list_tot)
    # Keep only the subjects with a name and a file for every constraint.
    list_combined = [names for names in list_combined if '' not in names]
    if not list_combined:
        raise IOError('Nothing to write to {}'.format(csv_file))
    # NOTE(review): touch_folder presumably creates the destination folder
    # when it is missing - confirm against niftynet.io.misc_io.
    touch_folder(os.path.dirname(csv_file))
    write_csv(csv_file, list_combined)

    return list_combined