Source code for src.data.datasets

import logging
import os
import tarfile
import wget
import pandas as pd
import scipy
import requests
import shutil
from src.utils.utils import check_path_exists
from src.preprocessing.file_preprocessing import name_norm

LOGGER = logging.getLogger('dataset-downloader')


[docs]def download_seqamlab_dataset(path: str = 'data/datasets/ytcelebrity'): """ Downloads the YouTube Celebrities Face Tracking and Recognition Dataset and parses a information.csv. Details about the dataset can be found here: http://seqamlab.com/youtube-celebrities-face-tracking-and-recognition-dataset/. Args: path (str): Path where the videos and information.csv should be saved. """ check_path_exists(path) url = 'http://seqamlab.com/wp-content/uploads/Data/ytcelebrity.tar' file = os.path.join(path, 'ytcelebrity.tar') LOGGER.info('Downloading Youtube Celebrities Face Tracking and Recognition Data Set') wget.download(url, file) LOGGER.info('Extracting ...') tar = tarfile.open(file) tar.extractall(path) tar.close() os.remove(os.path.join(path, 'ytcelebrity.tar')) videos = pd.Series(os.listdir(path)) information = pd.DataFrame(data={ 'file': videos, 'entities': name_norm( videos.apply(lambda x: [' '.join([k.capitalize() for k in os.path.splitext(path + '/' + x)[0].split('_')[3:5]])])) }) information = information.set_index('file') information.to_csv(path + '/information.csv')
[docs]def download_imdb_faces_dataset(path: str = 'data/datasets/imdb-faces'): """ Downloads the IMDb-Faces dataset and parses a information.csv. Details about the dataset can be found here: https://github.com/fwang91/IMDb-Face. !!! Many links are outdated. Only half of the dataset can still be downloaded. !!! Args: path (str): Path where the videos and information.csv should be saved. """ imdb_faces = pd.read_csv(os.path.join(path, 'IMDb-Face.csv')) entities = [] total_count = len(imdb_faces) for index, row in imdb_faces.iterrows(): try: entity = row['name'].replace('_', ' ') LOGGER.info(f'{index}/{total_count}: Downloading image of {entity}') wget.download(row['url'], f'./images/imdb-faces/{len(entities)}.jpg') entities.append(entity) except: LOGGER.warning(f'Could not download {row["url"]}') information = pd.DataFrame(data={ 'file': range(0, len(entities) - 1), 'entities': name_norm(entities) }) information = information.set_index('file') information.to_csv(os.path.join(path, 'information.csv'))
[docs]def download_imdb_wiki_dataset(path: str = 'data/datasets/imdb-wiki'): """ Downloads the IMDb-Wiki dataset and parses a information.csv. Details about the dataset can be found here: https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/. Args: path (str): Path where the videos and information.csv should be saved """ check_path_exists(path) LOGGER.info('Downloading imdb_meta.tar') url = 'https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/static/imdb_meta.tar' file = os.path.join(path, 'imdb_meta.tar') wget.download(url, file) LOGGER.info('Extracting imdb_meta.tar') tar = tarfile.open(file) tar.extractall(path) tar.close() LOGGER.info('Converting imdb.mat to information.csv') mat = scipy.io.loadmat(os.path.join(path, 'imdb/imdb.mat')) information = pd.DataFrame(data={ 'file': pd.Series(mat['imdb']['full_path'][0][0][0]).apply(lambda x: str(x[0])), 'entities': name_norm(mat['imdb']['name'][0][0][0]) }) information = information.set_index('file') information.to_csv(os.path.join(path, 'information.csv')) LOGGER.info('Removing unnecessary files') os.remove(os.path.join(path, 'imdb_meta.tar')) shutil.rmtree(os.path.join(path, 'imdb')) LOGGER.info('-- Dataset requires about 300GB free space --') urls = [ 'https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/static/imdb_0.tar', 'https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/static/imdb_1.tar', 'https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/static/imdb_2.tar', 'https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/static/imdb_3.tar', 'https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/static/imdb_4.tar', 'https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/static/imdb_5.tar', 'https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/static/imdb_6.tar', 'https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/static/imdb_7.tar', 'https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/static/imdb_8.tar', 'https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/static/imdb_9.tar' ] for part, url in enumerate(urls, start=1): LOGGER.info(f'Downloading part {part}/9') file = os.path.join(path, f'imdb_{str(part)}.tar') wget.download(url, file) LOGGER.info(f'Extracting part {part}/9') tar = tarfile.open(file) tar.extractall(path) tar.close()
[docs]def download_youtube_faces_db(path: str = 'data/datasets/youtube-faces-db', download: bool = False): """ Downloads the YouTube Faces Database and parses a information.csv. Details about the dataset can be found here: https://www.cs.tau.ac.il/~wolf/ytfaces/. Args: path (str): Path where the videos are located and the information.csv should be saved at. download (bool): Whether the dataset should be downloaded automatically or only parsed. The download can take long. """ check_path_exists(path) if download: def download_file(url): local_filename = url.split('/')[-1] with requests.get(url, stream=True, auth=('wolftau', 'wtal997')) as r: r.raise_for_status() with open(os.path.join(path, local_filename), 'wb') as f: for chunk in r.iter_content(chunk_size=8192): f.write(chunk) return local_filename LOGGER.info('Downloading ...') download_file('http://www.cslab.openu.ac.il/download/wolftau/YouTubeFaces.tar.gz') LOGGER.info('Extracting ...') tar = tarfile.open(os.path.join(path, 'YouTubeFaces.tar.gz')) tar.extractall(path) tar.close() os.remove(os.path.join(path, 'YouTubeFaces.tar.gz')) path = os.path.join(path, 'YouTubeFaces/frame_images_DB') LOGGER.info('Removing unnecessary files ...') for file_name in os.listdir(path): if file_name.endswith('.txt'): os.remove(os.path.join(path, file_name)) videos = [] entities = [] for entity in os.listdir(path): entity_path = os.path.join(path, entity) entity = entity.replace('_', ' ') if entity.startswith('.'): continue for movie in os.listdir(entity_path): if movie.startswith('.'): continue movie_path = os.path.join(entity_path, movie) for frame in os.listdir(movie_path): if frame.startswith('.'): continue videos.append(os.path.join(movie_path, frame)) entities.append(entity) information = pd.DataFrame(data={ 'file': videos, 'original_entities': entities }) entities_df = pd.DataFrame(information.original_entities.unique(), columns=['original_entities']) entities_df['entities'] = name_norm(entities_df.original_entities.tolist()) information = information.merge(entities_df, how='left', on='original_entities') information[['file', 'entities']].to_csv(os.path.join(path, 'information.csv'))