#!/usr/bin/env python
"""Split ``entries.tsv`` and ``experiments.tsv`` into one JSON file per row.

Each TSV row becomes a JSON object keyed by column name.  Columns that hold
JSON-encoded text in the TSV are decoded first, so they are emitted as nested
JSON values rather than as quoted strings.

Output layout (with the default arguments):

* ``entries/<entry_id>/entry.json``            -- one per row of entries.tsv
* ``entries/<experiment_id>.experiment.json``  -- one per row of experiments.tsv
"""

import json
import os
from pprint import pprint  # not used below; kept from the original import list

import pandas as pd

# Columns whose TSV cells contain JSON text that must be decoded before export.
ENTRY_JSON_COLUMNS = [
    'samples', 'contacts', 'experiment_ids', 'library_preps', 'platforms',
]
EXPERIMENT_JSON_COLUMNS = [
    'platforms', 'library_preps', 'samples', 'libraries', 'software',
]


def _decode_json_columns(frame, columns):
    """Replace the JSON text in each of ``columns`` with the decoded value.

    ``frame`` is modified in place.  Cells are cast to pandas' "string" dtype
    first so that values pandas parsed as numbers are handed to ``json.loads``
    as text.
    """
    for column in columns:
        frame[column] = frame[column].astype("string").apply(json.loads)


def _write_row_json(row, path):
    """Serialise one DataFrame row to ``path``, creating its directory first.

    ``Series.to_json`` does not create missing parent directories, so they are
    created here; directories that already exist are left untouched.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    row.to_json(path)


#########################
#### Process ENTRIES ####
#########################
def process_entries(tsv_path='entries.tsv', out_dir='entries'):
    """Write ``<out_dir>/<entry_id>/entry.json`` for every row of ``tsv_path``.

    Args:
        tsv_path: Tab-separated input file with a header row.  It must contain
            an ``entry_id`` column and every column in ``ENTRY_JSON_COLUMNS``.
        out_dir: Directory under which the per-entry directories are written.
    """
    entries = pd.read_table(tsv_path, header=0, sep='\t')

    # The row-wise export below does not emit the index, so the index is a
    # *copy* of entry_id; the original column stays in the JSON output.
    entries['entry_id2'] = entries['entry_id']
    entries.set_index('entry_id2', inplace=True)

    # 'team_id' keeps whatever dtype pandas inferred (its string conversion
    # was disabled in the original script).
    _decode_json_columns(entries, ENTRY_JSON_COLUMNS)

    # One JSON file per row of the input TSV.
    entries.apply(
        lambda row: _write_row_json(
            row, "{}/{}/entry.json".format(out_dir, row['entry_id'])),
        axis=1)


#############################
#### Process EXPERIMENTS ####
#############################
def process_experiments(tsv_path='experiments.tsv', out_dir='entries'):
    """Write ``<out_dir>/<experiment_id>.experiment.json`` for every row.

    Missing cells are replaced by empty strings, a ``description`` column is
    derived from ``species`` and ``experiment_id``, and the ``file`` and
    ``entry_id`` columns are left out of the JSON.

    Args:
        tsv_path: Tab-separated input file with a header row.  It must contain
            ``experiment_id``, ``entry_id``, ``file``, ``species``, ``notes``
            and every column in ``EXPERIMENT_JSON_COLUMNS``.
        out_dir: Directory the experiment JSON files are written to.
    """
    experiments = pd.read_table(tsv_path, header=0, sep='\t')

    # Same trick as for entries: index on a copy so experiment_id is exported.
    experiments['experiment_id2'] = experiments['experiment_id']
    experiments.set_index('experiment_id2', inplace=True)

    experiments = experiments.fillna('')
    experiments['description'] = (
        'LyRic transcript models for '
        + experiments['species'] + ' ' + experiments['experiment_id'])
    experiments = experiments.drop(['file', 'entry_id'], axis=1)

    _decode_json_columns(experiments, EXPERIMENT_JSON_COLUMNS)
    experiments['notes'] = experiments['notes'].astype("string")

    # 'entry_id' cannot both name the output path and be skipped in the JSON
    # file, so the files are written flat here and have to be moved into
    # their entry directory later.
    experiments.apply(
        lambda row: _write_row_json(
            row, "{}/{}.experiment.json".format(out_dir, row['experiment_id'])),
        axis=1)


def main():
    """Convert both TSV files found in the current working directory."""
    process_entries()
    process_experiments()


if __name__ == '__main__':
    main()