Source code for src.preprocessing.file_preprocessing

import logging
import wikipedia
import multiprocessing as mp

LOGGER = logging.getLogger('file-preprocessing')


[docs]def norm(name, j): """ Looks up the Wikipedia name for a dataset name. Args: name (str): The name to normalize. j (int): Helper variable to identify the origin process. Returns: name (str): Normalized name. j (int): Helper variable to identify the origin process. Number (int): 1 if the lookup succeeded, 0 otherwise. """ LOGGER.info(f'process{j} running') try: wikipedia.set_lang("en") search_list = wikipedia.search(name, results=1) if search_list: return search_list[0], j, 1 else: wikipedia.set_lang("de") search_list = wikipedia.search(name, results=1) if search_list: return search_list[0], j, 1 else: return name, j, 0 except: LOGGER.info(f'{name} error') return name, j, 0
[docs]def name_norm(name_list): """ Normalize name based on Wikipedia search. Args: name_list (list): List of names to be normalized. Returns: wiki_name (list): List of normalized names. """ def add_name(result): global wiki_name global missing_name name, j, search = result wiki_name[j] = name if not search: missing_name.append(name) global wiki_name wiki_name = [None] * len(name_list) global missing_name missing_name = [] j = 0 pool = mp.Pool(mp.cpu_count()) for name in name_list: pool.apply_async(norm, args=(name, j), callback=add_name) j = j + 1 pool.close() pool.join() LOGGER.info(f'{len(missing_name)} people cant be found in wikipedia, They are:') LOGGER.info(missing_name) return wiki_name