From 9e0765421c799039498bd8beefb622db0338014b Mon Sep 17 00:00:00 2001
From: djcomlab
Date: Wed, 8 Aug 2018 11:55:01 +0100
Subject: [PATCH 1/6] Fixes for #308 and updates to tests

---
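Note: the isatab.py hunks below re-enable the previously commented-out
Comment[] column handling for data file outputs in assay tables. As a
minimal sketch of the column-naming scheme used in the second hunk
(hypothetical label and comment values, plain Python rather than
isatools code):

    # Each comment on a data file output becomes one "<label>.Comment[<name>]"
    # key in the dict that backs the assay table, one cell per row.
    olabel = "Raw Data File"  # hypothetical output label
    co_name = "checksum"      # hypothetical comment name
    colabel = "{0}.Comment[{1}]".format(olabel, co_name)
    print(colabel)  # -> Raw Data File.Comment[checksum]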
 isatools/isatab.py                       | 21 +++---
 tests/test_create_models_json.py         |  1 -
 tests/test_create_models_study_design.py |  4 +-
 tests/test_isatab2json.py                |  1 -
 tests/test_json2isatab2json_convert.py   | 83 ------------------------
 tests/test_mw2isa.py                     |  2 -
 6 files changed, 10 insertions(+), 102 deletions(-)
 delete mode 100644 tests/test_json2isatab2json_convert.py

diff --git a/isatools/isatab.py b/isatools/isatab.py
index f4e51021..1af784b2 100644
--- a/isatools/isatab.py
+++ b/isatools/isatab.py
@@ -911,9 +911,9 @@ def write_assay_table_files(inv_obj, output_dir, write_factor_values=False):
 
                 for output in [x for x in node.outputs if isinstance(x, DataFile)]:
                     columns.append(output.label)
-                    # columns += flatten(
-                    #     map(lambda x: get_comment_column(output.label, x),
-                    #         output.comments))
+                    columns += flatten(
+                        map(lambda x: get_comment_column(output.label, x),
+                            output.comments))
 
             elif isinstance(node, Material):
                 olabel = node.type
@@ -982,9 +982,9 @@
                 for output in [x for x in node.outputs if isinstance(x, DataFile)]:
                     olabel = output.label
                     df_dict[olabel][-1] = output.filename
-                    # for co in output.comments:
-                    #     colabel = "{0}.Comment[{1}]".format(olabel, co.name)
-                    #     df_dict[colabel][-1] = co.value
+                    for co in output.comments:
+                        colabel = "{0}.Comment[{1}]".format(olabel, co.name)
+                        df_dict[colabel][-1] = co.value
 
             elif isinstance(node, Sample):
                 olabel = "Sample Name"
@@ -3515,12 +3515,9 @@ def read_tfile(tfile_path, index_col=None, factor_filter=None):
         tfile_fp.seek(0)
         log.debug("Reading file into DataFrame")
         tfile_fp = strip_comments(tfile_fp)
-        tfile_df = pd.read_csv(tfile_fp, dtype=str, sep='\t', index_col=index_col,
-                               memory_map=True, encoding='utf-8').fillna('')
-        tfile_df.isatab_header = header
-        # tfile_df = IsaTabDataFrame(
-        #     pd.read_csv(tfile_fp, dtype=str, sep='\t', index_col=index_col,
-        #                 memory_map=True, encoding='utf-8').fillna(''))
+        tfile_df = IsaTabDataFrame(
+            pd.read_csv(tfile_fp, dtype=str, sep='\t', index_col=index_col,
+                        memory_map=True, encoding='utf-8').fillna(''))
     if factor_filter:
         log.debug("Filtering DataFrame contents on Factor Value %s", factor_filter)
         return tfile_df[tfile_df['Factor Value[{}]'.format(factor_filter[0])] == factor_filter[1]]
diff --git a/tests/test_create_models_json.py b/tests/test_create_models_json.py
index f14e59bc..f7796600 100644
--- a/tests/test_create_models_json.py
+++ b/tests/test_create_models_json.py
@@ -150,7 +150,6 @@ def test_serialize_ms_assay_topology_modifiers(self):
                 json.dumps(top_mods, cls=SampleAssayPlanEncoder)
             )
         )
-        print(json.dumps(top_mods, cls=SampleAssayPlanEncoder, indent=4))
         self.assertTrue(expected == actual)
 
     @unittest.skip(
diff --git a/tests/test_create_models_study_design.py b/tests/test_create_models_study_design.py
index 3c5fbe9d..7cbdd55d 100644
--- a/tests/test_create_models_study_design.py
+++ b/tests/test_create_models_study_design.py
@@ -867,9 +867,7 @@ def test_study_from_2_level_factorial_plan(self):
         study = IsaModelObjectFactory(study_design).create_assays_from_plan()
         self.assertEqual(len(study.assays), 6)
         self.assertEqual(len(study.protocols), 4)
-        study.filename = 's_study.txt'
-        from isatools import isatab
-        print(isatab.dumps(Investigation(studies=[study])))
+
 
     def test_study_from_2_by_3_by_2_factorial_plan(self):
         factor1 = StudyFactor(name='1')
diff --git a/tests/test_isatab2json.py b/tests/test_isatab2json.py
index 020db674..76ec0b83 100644
--- a/tests/test_isatab2json.py
+++ b/tests/test_isatab2json.py
@@ -152,7 +152,6 @@ def test_isatab2json_convert_repeated_measure(self):
         actual_json = isatab2json.convert(
             os.path.join(self._tab_data_dir, test_case), validate_first=False,
             use_new_parser=True)
-        print(json.dumps(actual_json, indent=4))
         with open(os.path.join(self._tmp_dir, 'isa.json'), 'w') as out_fp:
             json.dump(actual_json, out_fp)
         with open(os.path.join(self._tmp_dir, 'isa.json')) as actual_json:
diff --git a/tests/test_json2isatab2json_convert.py b/tests/test_json2isatab2json_convert.py
deleted file mode 100644
index c8292920..00000000
--- a/tests/test_json2isatab2json_convert.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import os
-import unittest
-from isatools.convert import isatab2json, json2isatab
-import shutil
-import json
-from isatools.tests import utils
-import tempfile
-
-
-def setUpModule():
-    if not os.path.exists(utils.DATA_DIR):
-        raise FileNotFoundError("Could not fine test data directory in {0}. Ensure you have cloned the ISAdatasets "
-                                "repository using "
-                                "git clone -b tests --single-branch git@github.com:ISA-tools/ISAdatasets {0}"
-                                .format(utils.DATA_DIR))
-
-
-class TestJsonIsaTabTwoWayConvert(unittest.TestCase):
-
-    def setUp(self):
-        self._json_data_dir = utils.JSON_DATA_DIR
-        self._tmp_dir = tempfile.mkdtemp()
-
-    def tearDown(self):
-        shutil.rmtree(self._tmp_dir)
-
-    def test_json2isatab_isatab2json_2way_convert_sample_pool(self):
-        test_case = 'TEST-ISA-sample-pool'
-        with open(os.path.join(self._json_data_dir, test_case + '.json')) as test_json:
-            json2isatab.convert(test_json, self._tmp_dir, validate_first=False)
-            test_json.seek(0)  # reset pointer
-            expected_json = json.load(test_json)
-            actual_json = isatab2json.convert(self._tmp_dir, validate_first=False)
-            self.assertTrue(utils.assert_json_equal(expected_json, actual_json))
-
-    def test_json2isatab_isatab2json_2way_convert_source_split(self):
-        test_case = 'TEST-ISA-source-split'
-        with open(os.path.join(self._json_data_dir, test_case + '.json')) as test_json:
-            json2isatab.convert(test_json, self._tmp_dir, validate_first=False)
-            test_json.seek(0)  # reset pointer
-            expected_json = json.load(test_json)
-            actual_json = isatab2json.convert(self._tmp_dir, validate_first=False)
-            self.assertTrue(utils.assert_json_equal(expected_json, actual_json))
-
-    # def test_json2isatab_isatab2json_2way_convert_bii_i_1(self):
-    #     # FIXME: Get error in isatab2json.createUnitsCategories
-    #     # json_item.update(self.createOntologyAnnotation(value_attributes.Unit, value_attributes.Term_Source_REF, value_attributes.Term_Accession_Number))
-    #     # AttributeError: 'Attrs' object has no attribute 'Term_Source_REF'
-    #     # Are Units always OntologyAnnotations? (i.e. Unit column alway accompanied by Term Accession and
-    #     # Term Source REF?
-    #     test_case = 'BII-I-1'
-    #     test_json = open(os.path.join(self._json_data_dir, test_case, test_case + '.json'))
-    #     json2isatab.convert(test_json, self._tmp_dir)
-    #     test_json.seek(0)  # reset pointer
-    #     expected_json = json.load(test_json)
-    #     actual_json = isatab2json.convert(self._tmp_dir)
-    #     self.assertTrue(utils.assert_json_equal(expected_json, actual_json))
-    #
-    # def test_json2isatab_isatab2json_2way_convert_bii_s_3(self):
-    #     # FIXME: Get error in isatab2json.createUnitsCategories
-    #     # json_item.update(self.createOntologyAnnotation(value_attributes.Unit, value_attributes.Term_Source_REF, value_attributes.Term_Accession_Number))
-    #     # AttributeError: 'Attrs' object has no attribute 'Term_Source_REF'
-    #     # Are Units always OntologyAnnotations? (i.e. Unit column alway accompanied by Term Accession and
-    #     # Term Source REF? If so, related to below bii_s_7 error
-    #     test_case = 'BII-S-3'
-    #     test_json = open(os.path.join(self._json_data_dir, test_case, test_case + '.json'))
-    #     json2isatab.convert(test_json, self._tmp_dir)
-    #     test_json.seek(0)  # reset pointer
-    #     expected_json = json.load(test_json)
-    #     actual_json = isatab2json.convert(self._tmp_dir)
-    #     self.assertTrue(utils.assert_json_equal(expected_json, actual_json))
-    #
-    # def test_json2isatab_isatab2json_2way_convert_bii_s_7(self):
-    #     # FIXME: It reports a big diff because when doing json2isatab, if Term Accession and Term Source REF columns
-    #     #        are empty it strips them out. When going back from isatab2json, it converts as string and not
-    #     #        OntologyAnnotation since there is no extra info to be able to cast back to original
-    #     test_case = 'BII-S-7'
-    #     test_json = open(os.path.join(self._json_data_dir, test_case, test_case + '.json'))
-    #     json2isatab.convert(test_json, self._tmp_dir)
-    #     test_json.seek(0)  # reset pointer
-    #     expected_json = json.load(test_json)
-    #     actual_json = isatab2json.convert(self._tmp_dir)
-    #     self.assertTrue(utils.assert_json_equal(expected_json, actual_json))
\ No newline at end of file
diff --git a/tests/test_mw2isa.py b/tests/test_mw2isa.py
index 767d514a..5496b311 100644
--- a/tests/test_mw2isa.py
+++ b/tests/test_mw2isa.py
@@ -30,8 +30,6 @@ def test_conversion(self):
         if success and validate:
             log.info("conversion successful, invoking the validator for " + study_id)
             with open(os.path.join(self._tmp_dir, study_id, 'i_investigation.txt')) as fp:
-                # print(isatab.dumps(isatab.load(fp)))
-                # fp.seek(0)
                 report = isatab.validate(fp)
                 print(report)
                 if len(report['errors']) > 0:

From 064ffed1f3ef8cd2e36a559848cdbd8d7ec32232 Mon Sep 17 00:00:00 2001
From: djcomlab
Date: Wed, 8 Aug 2018 12:18:07 +0100
Subject: [PATCH 2/6] Add missing data frame wrapper and unpin requirements to
 complete fix for #308

---
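Note: the tests below previously attached isatab_header to a plain pandas
DataFrame, an ad-hoc attribute that pandas does not carry through derived
frames. A minimal sketch of why a wrapper class fixes this (illustrative
only, assuming the documented pandas subclassing hooks; the real
IsaTabDataFrame lives in isatools.isatab and derives the header itself):

    import pandas as pd

    class HeaderDataFrame(pd.DataFrame):
        # Attributes listed in _metadata are propagated by __finalize__.
        _metadata = ['isatab_header']

        @property
        def _constructor(self):
            # Ensure slices/copies come back as this subclass, not DataFrame.
            return HeaderDataFrame

    df = HeaderDataFrame({'Sample Name': ['sample1']})
    df.isatab_header = ['Sample Name']
    print(df.head().isatab_header)  # survives the derived frame: ['Sample Name']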
 requirements-tests.txt |  2 +-
 requirements.txt       |  2 +-
 setup.py               |  2 +-
 tests/test_isatab.py   | 16 ++++++----------
 4 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/requirements-tests.txt b/requirements-tests.txt
index b47f534b..e2c8bfec 100644
--- a/requirements-tests.txt
+++ b/requirements-tests.txt
@@ -1,6 +1,6 @@
 numpy
 jsonschema
-pandas==0.20.*
+pandas
 networkx
 behave
 httpretty
diff --git a/requirements.txt b/requirements.txt
index 3725ea4b..4847f877 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 numpy
 jsonschema
-pandas==0.20.*
+pandas
 networkx
 lxml
 requests
diff --git a/setup.py b/setup.py
index 84cc10c9..49e9d30f 100644
--- a/setup.py
+++ b/setup.py
@@ -48,7 +48,7 @@
     install_requires=[
         'numpy',
         'jsonschema',
-        'pandas==0.20.*',
+        'pandas',
         'networkx',
         'lxml',
         'requests',
diff --git a/tests/test_isatab.py b/tests/test_isatab.py
index cad66b77..a6081b68 100644
--- a/tests/test_isatab.py
+++ b/tests/test_isatab.py
@@ -13,6 +13,7 @@
 from isatools.model import *
 from isatools.tests.utils import assert_tab_content_equal
 from isatools.tests import utils
+from isatools.isatab import IsaTabDataFrame
 
 
 def setUpModule():
@@ -911,8 +912,7 @@ def test_source_protocol_ref_sample(self):
         factory = ProcessSequenceFactory(study_protocols=[Protocol(name="sample collection")])
         table_to_load = """Source Name\tProtocol REF\tSample Name
source1\tsample collection\tsample1"""
-        DF = pd.read_csv(StringIO(table_to_load), sep='\t')
-        DF.isatab_header = ["Source Name", "Protocol REF", "Sample Name"]
+        DF = IsaTabDataFrame(pd.read_csv(StringIO(table_to_load), sep='\t'))
         so, sa, om, d, pr, _, __ = factory.create_from_df(DF)
         self.assertEqual(len(so), 1)
         self.assertEqual(len(sa), 1)
@@ -925,8 +925,7 @@ def test_source_protocol_ref_sample_x2(self):
         table_to_load = """Source Name\tProtocol REF\tSample Name
source1\tsample collection\tsample1
source2\tsample collection\tsample2"""
-        DF = pd.read_csv(StringIO(table_to_load), sep='\t')
-        DF.isatab_header = ["Source Name", "Protocol REF", "Sample Name"]
+        DF = IsaTabDataFrame(pd.read_csv(StringIO(table_to_load), sep='\t'))
         so, sa, om, d, pr, _, __ = factory.create_from_df(DF)
         self.assertEqual(len(so), 2)
         self.assertEqual(len(sa), 2)
@@ -939,8 +938,7 @@ def test_source_protocol_ref_split_sample(self):
         table_to_load = """Source Name\tProtocol REF\tSample Name
source1\tsample collection\tsample1
source1\tsample collection\tsample2"""
-        DF = pd.read_csv(StringIO(table_to_load), sep='\t')
-        DF.isatab_header = ["Source Name", "Protocol REF", "Sample Name"]
+        DF = IsaTabDataFrame(pd.read_csv(StringIO(table_to_load), sep='\t'))
         so, sa, om, d, pr, _, __ = factory.create_from_df(DF)
         self.assertEqual(len(so), 1)
         self.assertEqual(len(sa), 2)
@@ -953,8 +951,7 @@ def test_source_protocol_ref_pool_sample(self):
         table_to_load = """Source Name\tProtocol REF\tSample Name
source1\tsample collection\tsample1
source2\tsample collection\tsample1"""
-        DF = pd.read_csv(StringIO(table_to_load), sep='\t')
-        DF.isatab_header = ["Source Name", "Protocol REF", "Sample Name"]
+        DF = IsaTabDataFrame(pd.read_csv(StringIO(table_to_load), sep='\t'))
         so, sa, om, d, pr, _, __ = factory.create_from_df(DF)
         self.assertEqual(len(so), 2)
         self.assertEqual(len(sa), 1)
@@ -969,8 +966,7 @@ def test_sample_protocol_ref_split_extract_protocol_ref_data(self):
         table_to_load = """Sample Name\tProtocol REF\tExtract Name\tProtocol REF\tRaw Data File
sample1\textraction\te1\tscanning\td1
sample1\textraction\te2\tscanning\td2"""
-        DF = pd.read_csv(StringIO(table_to_load), sep='\t')
-        DF.isatab_header = ["Source Name", "Protocol REF", "Extract Name", "Protocol REF", "Raw Data File"]
+        DF = IsaTabDataFrame(pd.read_csv(StringIO(table_to_load), sep='\t'))
         so, sa, om, d, pr, _, __ = factory.create_from_df(DF)
         self.assertEqual(len(so), 0)
         self.assertEqual(len(sa), 1)

From 739841b173543f1d36a92a054b42bd993f4b3fbd Mon Sep 17 00:00:00 2001
From: djcomlab
Date: Wed, 8 Aug 2018 13:36:17 +0100
Subject: [PATCH 3/6] Add utility function for utf8 text file opening

---
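Note: utf8_text_file_open (added to isatools/utils.py below) returns a
binary handle on Python 2 and a utf-8 text handle opened with newline=''
on Python 3, which is what the csv module expects for correct newline
handling. A usage sketch mirroring the parse() hunk below (the file path
is hypothetical):

    import csv

    from isatools.utils import utf8_text_file_open

    with utf8_text_file_open('i_investigation.txt') as fp:
        for row in csv.reader(fp, dialect='excel-tab'):
            print(row[0])  # first cell of each investigation-file row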
 isatools/convert/isatab2w4m.py | 16 +++---
 isatools/isatab.py             | 67 +++++++++++++++++-----------------
 isatools/utils.py              |  9 +++++
 tox.ini                        |  2 +-
 4 files changed, 54 insertions(+), 40 deletions(-)

diff --git a/isatools/convert/isatab2w4m.py b/isatools/convert/isatab2w4m.py
index b67c10d1..66ed3b2f 100644
--- a/isatools/convert/isatab2w4m.py
+++ b/isatools/convert/isatab2w4m.py
@@ -14,6 +14,7 @@
 
 from isatools import isatab as ISATAB
+from isatools.utils import utf8_text_file_open
 
 # original from https://github.com/workflow4metabolomics/mtbls-dwnld/blob/develop/isatab2w4m.py
 __author__ = 'pkrog (Pierrick Roger)'
@@ -334,7 +335,7 @@ def get_investigation_file(input_dir):
 ################################################################
 
 def load_investigation(investigation_file):
-    f = open(investigation_file, 'r')
+    f = utf8_text_file_open(investigation_file)
     investigation = ISATAB.load(f)
     return investigation
@@ -364,8 +365,10 @@ def get_sample_names(assay_df, measures_df):
 def make_sample_metadata(study_df, assay_df, sample_names, normalize=True):
 
     # Normalize column names
-    study_df.set_axis(axis=1, labels=make_names(study_df.axes[1].tolist()))
-    assay_df.set_axis(axis=1, labels=make_names(assay_df.axes[1].tolist()))
+    study_df.set_axis(
+        inplace=True, axis=1, labels=make_names(study_df.axes[1].tolist()))
+    assay_df.set_axis(
+        inplace=True, axis=1, labels=make_names(assay_df.axes[1].tolist()))
 
     # Merge data frames
     sample_metadata = assay_df.merge(study_df, on='Sample.Name', sort=False)
@@ -374,7 +377,7 @@ def make_sample_metadata(study_df, assay_df, sample_names, normalize=True):
     if normalize:
         norm_sample_names = make_names(sample_names, uniq=True)
         sample_metadata.insert(0, 'sample.name', norm_sample_names)
-        sample_metadata.set_axis(axis=1, labels=make_names(
+        sample_metadata.set_axis(inplace=True, axis=1, labels=make_names(
             sample_metadata.axes[1].tolist(), uniq=True))
 
     return sample_metadata
@@ -395,7 +398,7 @@ def make_variable_metadata(measures_df, sample_names, variable_names,
 
     # Normalize
     if normalize:
-        variable_metadata.set_axis(axis=1, labels=make_names(
+        variable_metadata.set_axis(inplace=True, axis=1, labels=make_names(
             variable_metadata.axes[1].tolist(), uniq=True))
 
     return variable_metadata
@@ -422,7 +425,8 @@ def make_matrix(measures_df, sample_names, variable_names, normalize=True):
     if normalize:
         norm_sample_names = make_names(sample_names, uniq=True)
         norm_sample_names.insert(0, 'variable.name')
-        sample_variable_matrix.set_axis(axis=1, labels=norm_sample_names)
+        sample_variable_matrix.set_axis(
+            inplace=True, axis=1, labels=norm_sample_names)
 
     return sample_variable_matrix
diff --git a/isatools/isatab.py b/isatools/isatab.py
index 1af784b2..eca6f555 100644
--- a/isatools/isatab.py
+++ b/isatools/isatab.py
@@ -30,6 +30,7 @@
 from isatools import logging as isa_logging
 from isatools.io import isatab_configurator
 from isatools.model import *
+from isatools.utils import utf8_text_file_open
 
 log = logging.getLogger('isatools')
@@ -141,7 +142,7 @@ def __init__(self, tab_options=None, show_progressbar=None, log_level=None):
 
     def parse(self, filename):
         try:
-            with open(filename, encoding='utf-8') as unicode_file:
+            with utf8_text_file_open(filename) as unicode_file:
                 ttable_reader = csv.reader(
                     filter(lambda r: r[0] != '#', unicode_file),
                     dialect='excel-tab')
@@ -1246,7 +1247,7 @@ def _build_section_df(f):
 def check_utf8(fp):
     """Used for rule 0010"""
     import chardet
-    with open(fp.name, 'rb') as fp:
+    with utf8_text_file_open(fp.name) as fp:
         charset = chardet.detect(fp.read())
         if charset['encoding'] is not 'UTF-8' and charset['encoding'] is not 'ascii':
             validator_warnings.append({
@@ -1526,7 +1527,7 @@ def check_table_files_read(i_df, dir_context):
         study_filename = study_df.iloc[0]['Study File Name']
         if study_filename is not '':
             try:
-                with open(os.path.join(dir_context, study_filename), encoding='utf-8'):
+                with utf8_text_file_open(os.path.join(dir_context, study_filename)):
                     pass
             except FileNotFoundError:
                 validator_errors.append({
@@ -1538,7 +1539,7 @@
         for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
             if assay_filename is not '':
                 try:
-                    with open(os.path.join(dir_context, assay_filename), encoding='utf-8'):
+                    with utf8_text_file_open(os.path.join(dir_context, assay_filename)):
                         pass
                 except FileNotFoundError:
                     validator_errors.append({
@@ -1555,14 +1556,14 @@ def check_table_files_load(i_df, dir_context):
         study_filename = study_df.iloc[0]['Study File Name']
         if study_filename is not '':
             try:
-                with open(os.path.join(dir_context, study_filename), encoding='utf-8') as fp:
+                with utf8_text_file_open(os.path.join(dir_context, study_filename)) as fp:
                     load_table_checks(fp)
             except FileNotFoundError:
                 pass
         for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
             if assay_filename is not '':
                 try:
-                    with open(os.path.join(dir_context, assay_filename), encoding='utf-8') as fp:
+                    with utf8_text_file_open(os.path.join(dir_context, assay_filename)) as fp:
                         load_table_checks(fp)
                 except FileNotFoundError:
                     pass
@@ -1573,7 +1574,7 @@ def check_samples_not_declared_in_study_used_in_assay(i_df, dir_context):
         study_filename = study_df.iloc[0]['Study File Name']
         if study_filename is not '':
             try:
-                with open(os.path.join(dir_context, study_filename), encoding='utf-8') as s_fp:
+                with utf8_text_file_open(os.path.join(dir_context, study_filename)) as s_fp:
                     study_df = load_table(s_fp)
                     study_samples = set(study_df['Sample Name'])
             except FileNotFoundError:
@@ -1581,7 +1582,7 @@
         for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
             if assay_filename is not '':
                 try:
-                    with open(os.path.join(dir_context, assay_filename), encoding='utf-8') as a_fp:
+                    with utf8_text_file_open(os.path.join(dir_context, assay_filename)) as a_fp:
                         assay_df = load_table(a_fp)
                         assay_samples = set(assay_df['Sample Name'])
                         if not assay_samples.issubset(study_samples):
@@ -1599,7 +1600,7 @@ def check_protocol_usage(i_df, dir_context):
         if study_filename is not '':
             try:
                 protocol_refs_used = set()
-                with open(os.path.join(dir_context, study_filename), encoding='utf-8') as s_fp:
+                with utf8_text_file_open(os.path.join(dir_context, study_filename)) as s_fp:
                     study_df = load_table(s_fp)
                     for protocol_ref_col in [i for i in study_df.columns if i.startswith('Protocol REF')]:
                         protocol_refs_used = protocol_refs_used.union(study_df[protocol_ref_col])
@@ -1621,7 +1622,7 @@
             if assay_filename is not '':
                 try:
                     protocol_refs_used = set()
-                    with open(os.path.join(dir_context, assay_filename), encoding='utf-8') as a_fp:
+                    with utf8_text_file_open(os.path.join(dir_context, assay_filename)) as a_fp:
                         assay_df = load_table(a_fp)
                         for protocol_ref_col in [i for i in assay_df.columns if i.startswith('Protocol REF')]:
                             protocol_refs_used = protocol_refs_used.union(assay_df[protocol_ref_col])
@@ -1642,7 +1643,7 @@
         protocol_refs_used = set()
         if study_filename is not '':
             try:
-                with open(os.path.join(dir_context, study_filename), encoding='utf-8') as s_fp:
+                with utf8_text_file_open(os.path.join(dir_context, study_filename)) as s_fp:
                     study_df = load_table(s_fp)
                     for protocol_ref_col in [i for i in study_df.columns if i.startswith('Protocol REF')]:
                         protocol_refs_used = protocol_refs_used.union(study_df[protocol_ref_col])
@@ -1651,7 +1652,7 @@
         for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
             if assay_filename is not '':
                 try:
-                    with open(os.path.join(dir_context, assay_filename), encoding='utf-8') as a_fp:
+                    with utf8_text_file_open(os.path.join(dir_context, assay_filename)) as a_fp:
                         assay_df = load_table(a_fp)
                         for protocol_ref_col in [i for i in assay_df.columns if i.startswith('Protocol REF')]:
                             protocol_refs_used = protocol_refs_used.union(assay_df[protocol_ref_col])
@@ -1821,7 +1822,7 @@ def check_study_factor_usage(i_df, dir_context):
         if study_filename is not '':
             try:
                 study_factors_used = set()
-                with open(os.path.join(dir_context, study_filename), encoding='utf-8') as s_fp:
+                with utf8_text_file_open(os.path.join(dir_context, study_filename)) as s_fp:
                     study_df = load_table(s_fp)
                     study_factor_ref_cols = [i for i in study_df.columns if _RX_FACTOR_VALUE.match(i)]
                    for col in study_factor_ref_cols:
@@ -1837,7 +1838,7 @@
             if assay_filename is not '':
                 try:
                     study_factors_used = set()
-                    with open(os.path.join(dir_context, assay_filename), encoding='utf-8') as a_fp:
+                    with utf8_text_file_open(os.path.join(dir_context, assay_filename)) as a_fp:
                         assay_df = load_table(a_fp)
                         study_factor_ref_cols = set([i for i in assay_df.columns if _RX_FACTOR_VALUE.match(i)])
                         for col in study_factor_ref_cols:
@@ -1852,7 +1853,7 @@
         study_factors_used = set()
         if study_filename is not '':
             try:
-                with open(os.path.join(dir_context, study_filename), encoding='utf-8') as s_fp:
+                with utf8_text_file_open(os.path.join(dir_context, study_filename)) as s_fp:
                     study_df = load_table(s_fp)
                     study_factor_ref_cols = [i for i in study_df.columns if _RX_FACTOR_VALUE.match(i)]
                     for col in study_factor_ref_cols:
@@ -1863,7 +1864,7 @@
         for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
             if assay_filename is not '':
                 try:
-                    with open(os.path.join(dir_context, assay_filename), encoding='utf-8') as a_fp:
+                    with utf8_text_file_open(os.path.join(dir_context, assay_filename)) as a_fp:
                         assay_df = load_table(a_fp)
                         study_factor_ref_cols = set([i for i in assay_df.columns if _RX_FACTOR_VALUE.match(i)])
                         for col in study_factor_ref_cols:
@@ -1890,7 +1891,7 @@ def check_protocol_parameter_usage(i_df, dir_context):
         if study_filename is not '':
             try:
                 protocol_parameters_used = set()
-                with open(os.path.join(dir_context, study_filename), encoding='utf-8') as s_fp:
+                with utf8_text_file_open(os.path.join(dir_context, study_filename)) as s_fp:
                     study_df = load_table(s_fp)
                     parameter_value_cols = [i for i in study_df.columns if _RX_PARAMETER_VALUE.match(i)]
                     for col in parameter_value_cols:
@@ -1906,7 +1907,7 @@
             if assay_filename is not '':
                 try:
                     protocol_parameters_used = set()
-                    with open(os.path.join(dir_context, assay_filename), encoding='utf-8') as a_fp:
+                    with utf8_text_file_open(os.path.join(dir_context, assay_filename)) as a_fp:
                         assay_df = load_table(a_fp)
                         parameter_value_cols = [i for i in assay_df.columns if _RX_PARAMETER_VALUE.match(i)]
                         for col in parameter_value_cols:
@@ -1922,7 +1923,7 @@
         protocol_parameters_used = set()
         if study_filename is not '':
             try:
-                with open(os.path.join(dir_context, study_filename), encoding='utf-8') as s_fp:
+                with utf8_text_file_open(os.path.join(dir_context, study_filename)) as s_fp:
                     study_df = load_table(s_fp)
                     parameter_value_cols = [i for i in study_df.columns if _RX_PARAMETER_VALUE.match(i)]
                     for col in parameter_value_cols:
@@ -1933,7 +1934,7 @@
         for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
             if assay_filename is not '':
                 try:
-                    with open(os.path.join(dir_context, assay_filename), encoding='utf-8') as a_fp:
+                    with utf8_text_file_open(os.path.join(dir_context, assay_filename)) as a_fp:
                         assay_df = load_table(a_fp)
                         parameter_value_cols = [i for i in assay_df.columns if _RX_PARAMETER_VALUE.match(i)]
                         for col in parameter_value_cols:
@@ -2016,7 +2017,7 @@ def check_term_source_refs_in_assay_tables(i_df, dir_context):
         study_filename = study_df.iloc[0]['Study File Name']
         if study_filename is not '':
             try:
-                with open(os.path.join(dir_context, study_filename), encoding='utf-8') as s_fp:
+                with utf8_text_file_open(os.path.join(dir_context, study_filename)) as s_fp:
                     df = load_table(s_fp)
                     columns = df.columns
                     object_index = [i for i, x in enumerate(columns) if x.startswith('Term Source REF')]
@@ -2063,7 +2064,7 @@
        for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
            if assay_filename is not '':
                try:
-                    with open(os.path.join(dir_context, assay_filename), encoding='utf-8') as a_fp:
+                    with utf8_text_file_open(os.path.join(dir_context, assay_filename)) as a_fp:
                        df = load_table(a_fp)
                        columns = df.columns
                        object_index = [i for i, x in enumerate(columns) if x.startswith('Term Source REF')]
@@ -2369,7 +2370,7 @@ def check_study_assay_tables_against_config(i_df, dir_context, configs):
         protocol_names_and_types = dict(zip(protocol_names, protocol_types))
         if study_filename is not '':
             try:
-                with open(os.path.join(dir_context, study_filename), encoding='utf-8') as s_fp:
+                with utf8_text_file_open(os.path.join(dir_context, study_filename)) as s_fp:
                     df = load_table(s_fp)
                     config = configs[('[sample]', '')]
                     log.info("Checking study file {} against default study table configuration...".format(study_filename))
@@ -2382,7 +2383,7 @@
             technology_type = assay_df['Study Assay Technology Type'].tolist()[0]
             if assay_filename is not '':
                 try:
-                    with open(os.path.join(dir_context, assay_filename), encoding='utf-8') as a_fp:
+                    with utf8_text_file_open(os.path.join(dir_context, assay_filename)) as a_fp:
                         df = load_table(a_fp)
                         lowered_mt = measurement_type.lower()
                         lowered_tt = technology_type.lower()
@@ -2819,7 +2820,7 @@ def validate(fp, config_dir=default_config_dir, log_level=None):
         protocol_names_and_types = dict(zip(protocol_names, protocol_types))
         try:
             log.info("Loading... {}".format(study_filename))
-            with open(os.path.join(os.path.dirname(fp.name), study_filename), encoding='utf-8') as s_fp:
+            with utf8_text_file_open(os.path.join(os.path.dirname(fp.name), study_filename)) as s_fp:
                 study_sample_table = load_table(s_fp)
                 study_sample_table.filename = study_filename
                 config = configs[('[sample]', '')]
@@ -2875,7 +2876,7 @@
             else:
                 try:
                     log.info("Loading... {}".format(assay_filename))
-                    with open(os.path.join(os.path.dirname(fp.name), assay_filename), encoding='utf-8') as a_fp:
+                    with utf8_text_file_open(os.path.join(os.path.dirname(fp.name), assay_filename)) as a_fp:
                         assay_table = load_table(a_fp)
                         assay_table.filename = assay_filename
                         assay_tables.append(assay_table)
@@ -2988,7 +2989,7 @@ def batch_validate(tab_dir_list):
         if len(i_files) != 1:
             log.warning("Could not find an investigation file, skipping {}".format(tab_dir))
         else:
-            with open(i_files[0], encoding='utf-8') as fp:
+            with utf8_text_file_open(i_files[0]) as fp:
                 batch_report['batch_report'].append(
                     {
                         "filename": fp.name,
@@ -3007,16 +3008,16 @@ def dumps(isa_obj, skip_dump_tables=False,
    dump(isa_obj=isa_obj, output_path=tmp, skip_dump_tables=skip_dump_tables,
         write_factor_values_in_assay_table=
         write_factor_values_in_assay_table)
-    with open(os.path.join(tmp, 'i_investigation.txt'), encoding='utf-8') as i_fp:
+    with utf8_text_file_open(os.path.join(tmp, 'i_investigation.txt')) as i_fp:
         output += os.path.join(tmp, 'i_investigation.txt') + '\n'
         output += i_fp.read()
     for s_file in glob.iglob(os.path.join(tmp, 's_*')):
-        with open(s_file, encoding='utf-8') as s_fp:
+        with utf8_text_file_open(s_file) as s_fp:
             output += "--------\n"
             output += s_file + '\n'
             output += s_fp.read()
     for a_file in glob.iglob(os.path.join(tmp, 'a_*')):
-        with open(a_file, encoding='utf-8') as a_fp:
+        with utf8_text_file_open(a_file) as a_fp:
             output += "--------\n"
             output += a_file + '\n'
             output += a_fp.read()
@@ -3153,7 +3154,7 @@ def get_comments_row(cols, row):
     if os.path.isdir(isatab_path_or_ifile):
         fnames = glob.glob(os.path.join(isatab_path_or_ifile, "i_*.txt"))
         assert len(fnames) == 1
-        FP = open(fnames[0], encoding='utf-8')
+        FP = utf8_text_file_open(fnames[0])
     elif hasattr(isatab_path_or_ifile, 'read'):
         FP = isatab_path_or_ifile
     else:
@@ -3508,7 +3509,7 @@ def isatab_header(self):
 
 def read_tfile(tfile_path, index_col=None, factor_filter=None):
     log.debug("Opening %s", tfile_path)
-    with open(tfile_path, encoding='utf-8') as tfile_fp:
+    with utf8_text_file_open(tfile_path) as tfile_fp:
         log.debug("Reading file header")
         reader = csv.reader(tfile_fp, dialect='excel-tab')
         header = list(next(reader))
@@ -4229,7 +4230,7 @@ def parse_investigation(self, in_filename):
             'studycontacts')
         isecdict = {}
         ssecdicts = []
-        with open(in_filename, encoding='utf-8') as in_file:
+        with utf8_text_file_open(in_filename) as in_file:
            tabreader = csv.reader(
                filter(lambda r: r[0] != '#', in_file), dialect='excel-tab')
            current_section = ''
diff --git a/isatools/utils.py b/isatools/utils.py
index 076e26ec..05a50dec 100644
--- a/isatools/utils.py
+++ b/isatools/utils.py
@@ -8,6 +8,7 @@
 import uuid
 from functools import reduce
 from zipfile import ZipFile
+import sys
 
 from isatools import isatab
@@ -866,3 +867,11 @@ def remove_unused_protocols(self):
         investigation, output_path=os.path.dirname(self.path),
         i_file_name='{filename}.fix'.format(
             filename=os.path.basename(self.path)), skip_dump_tables=True)
+
+
+def utf8_text_file_open(path):
+    if sys.version_info[0] < 3:
+        fp = open(path, 'rb')
+    else:
+        fp = open(path, 'r', newline='', encoding='utf8')
+    return fp
diff --git a/tox.ini b/tox.ini
index 52c00576..6cdf7615 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py34,py35,py36
+envlist = py36
 
 [testenv]
 deps=-r{toxinidir}/requirements-tests.txt

From 819eb8b61106a92716d9dc8a091c8ea9619f35fb Mon Sep 17 00:00:00 2001
From: djcomlab
Date: Wed, 8 Aug 2018 13:58:06 +0100
Subject: [PATCH 4/6] Re-introduce other py3* envs for full testing

---
 .travis.yml | 7 ++++++-
 tox.ini     | 6 +-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index a8fdf5cf..47d0e6ba 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,6 +1,11 @@
+sudo: false
 language: python
+cache: pip
 python:
-- '3.6'
+  - 3.4
+  - 3.5
+  - 3.6
+  - 3.7
 before_install:
 - bash -x get_test_data.sh
 install:
diff --git a/tox.ini b/tox.ini
index 6cdf7615..686573cc 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,10 +1,6 @@
 [tox]
-envlist = py36
+envlist = py34,py35,py36,py37
 
 [testenv]
 deps=-r{toxinidir}/requirements-tests.txt
 commands=nosetests
-
-[travis]
-python =
-    3.6: py36

From c2be1084f63250ea3b969e624c8ef191d6114259 Mon Sep 17 00:00:00 2001
From: djcomlab
Date: Wed, 8 Aug 2018 14:00:43 +0100
Subject: [PATCH 5/6] Fix build CI

---
 .travis.yml | 3 ---
 tox.ini     | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 47d0e6ba..9f5ad1f9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,10 +2,7 @@ sudo: false
 language: python
 cache: pip
 python:
-  - 3.4
-  - 3.5
   - 3.6
-  - 3.7
 before_install:
 - bash -x get_test_data.sh
 install:
diff --git a/tox.ini b/tox.ini
index 686573cc..17b749f3 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py34,py35,py36,py37
+envlist = py34,py35,py36
 
 [testenv]
 deps=-r{toxinidir}/requirements-tests.txt

From 317dbf647e0054f62a81feaa059ce326ee14b0f7 Mon Sep 17 00:00:00 2001
From: djcomlab
Date: Wed, 8 Aug 2018 14:26:49 +0100
Subject: [PATCH 6/6] Update version number for release

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 49e9d30f..fe69401a 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
 
 setup(
     name='isatools',
-    version='0.10.2',
+    version='0.10.3',
     packages=['isatools',
               'isatools.convert',
               'isatools.create',