Help on Python package (pygtftk)

The GTF class

Class declaration of GTF object.

When using pygtftk, a GTF object's methods may return:

  • a GTF object (a representation of a GTF file).

  • a TAB object (a representation of a tabulated file).

  • a FASTA object (a representation of a fasta file).

class pygtftk.gtf_interface.GTF(input_obj=None, check_ensembl_format=True, new_data=None)

An interface to a GTF file. This object returns a GTF, TAB or FASTA object.

add_attr_column(input_file=None, new_key='new_key')

Simply add a new column of attribute/value to each line.

Parameters
  • input_file – a single column file with nrow(GTF_DATA) lines. File lines should be sorted according to the GTF_DATA object. Lines for which a new key should not be added should contain “?”.

  • new_key – name of the novel key.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_path = get_example_file()[0]
>>> a_col = get_example_file(ext="csv")[0]
>>> a_gtf = GTF(a_path)
>>> a_gtf = a_gtf.add_attr_column(open(a_col), new_key='foo')
>>> a_list = a_gtf.extract_data('foo', as_list=True)
>>> assert [int(x) for x in a_list] == list(range(70))
add_attr_from_dict(feat='*', key='transcript_id', a_dict=None, new_key='new_key')

Add key/value pairs to the GTF object.

Parameters
  • feat – The comma-separated list of target features. If None, all the features.

  • key – The name of the key used for joining (e.g ‘transcript_id’).

  • a_dict – key and values will be used to call add_attr_from_file.

  • new_key – A name for the novel key.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> a_dict = a_gtf.nb_exons()
>>> a_gtf = a_gtf.add_attr_from_dict(feat="transcript", a_dict=a_dict, new_key="exon_nb")
>>> b_dict = a_gtf.select_by_key("feature", "transcript").extract_data("transcript_id,exon_nb", as_dict_of_values=True)
>>> assert a_dict['G0006T001'] == int(b_dict['G0006T001'])
>>> assert a_dict['G0008T001'] == int(b_dict['G0008T001'])
add_attr_from_file(feat=None, key='transcript_id', new_key='new_key', inputfile=None, has_header=False)

Add key/value pairs to the GTF object.

Parameters
  • feat – The comma-separated list of target features. If None, all the features.

  • key – The name of the key used for joining (i.e the key corresponding to value provided in the file).

  • new_key – A name for the novel key.

  • inputfile – A two column (e.g transcript_id and new value) file.

  • has_header – Whether the file to join contains column names.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> join_file = get_example_file("simple", "join")[0]
>>> b_gtf = a_gtf.add_attr_from_file(feat="gene", key="gene_id", new_key="bla", inputfile=join_file)
>>> b_list = b_gtf.select_by_key("feature", "gene").extract_data("bla", as_list=True, no_na=True, hide_undef=True)
>>> assert b_list == ['0.2322', '0.999', '0.5555']
>>> assert b_gtf.del_attr(keys='bla').get_attr_list() == ['gene_id', 'transcript_id', 'exon_id', 'ccds_id']
add_attr_from_list(feat=None, key='transcript_id', key_value=(), new_key='new_key', new_key_value=())

Add key/value pairs to the GTF object.

Parameters
  • feat – The comma-separated list of target features. If None, all the features.

  • key – The name of the key to use for joining (e.g ‘transcript_id’).

  • key_value – The values for the key (e.g [‘tx_1’, ‘tx_2’,…,’tx_n’])

  • new_key – A name for the novel key.

  • new_key_value – A name for the novel key (e.g [‘val_tx_1’, ‘val_tx_2’,…,’val_tx_n’]).

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> from pygtftk.utils import TAB
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> b_gtf = a_gtf.add_attr_from_list(feat="gene", key="gene_id", key_value=("G0001", "G0002"), new_key="coding_pot", new_key_value=("0.5", "0.8"))
>>> assert b_gtf.extract_data(keys="coding_pot", as_list=True, no_na=True, hide_undef=True) == ['0.5', '0.8']
>>> b_gtf = a_gtf.add_attr_from_list(feat="gene", key="gene_id", key_value=("G0002", "G0001"), new_key="coding_pot", new_key_value=("0.8", "0.5"))
>>> assert b_gtf.extract_data(keys="coding_pot", as_list=True, no_na=True, hide_undef=True) == ['0.5', '0.8']
>>> key_value = tuple(a_gtf.extract_data("transcript_id", no_na=True, as_list=True, nr=True))
>>> b=a_gtf.add_attr_from_list(None, key="transcript_id", key_value=key_value, new_key="bla", new_key_value=tuple([str(x) for x in range(len(key_value))]))
>>> assert b.del_attr(keys='bla').get_attr_list()  == ['gene_id', 'transcript_id', 'exon_id', 'ccds_id']
add_attr_from_matrix_file(feat=None, key='transcript_id', inputfile=None)

Add key/value pairs to the GTF file. Expects a matrix with row names as target keys, column names as novel keys, and each cell as a value.

Parameters
  • feat – The comma-separated list of target features. If None, all the features.

  • key – The name of the key to use for joining (i.e the key corresponding to value provided in the file).

  • inputfile – A two column (e.g transcript_id and new value) file.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> join_file = get_example_file("simple", "join_mat")[0]
>>> b_gtf = a_gtf.add_attr_from_matrix_file(feat="gene", key="gene_id", inputfile=join_file)
>>> assert b_gtf.extract_data("S1",as_list=True, no_na=True, hide_undef=True) == ['0.2322', '0.999', '0.5555']
>>> assert b_gtf.extract_data("S2",as_list=True, no_na=True, hide_undef=True) == ['0.4', '0.6', '0.7']
>>> assert b_gtf.del_attr(keys='S1,S2').get_attr_list() == ['gene_id', 'transcript_id', 'exon_id', 'ccds_id']
add_attr_to_pos(input_file=None, new_key='new_key')

Simply add a new column of attribute/value to specific lines.

Parameters
  • input_file – a two column file. First column contains the position/line (zero-based numbering). Second, the value to be added for ‘new_key’.

  • new_key – name of the novel key.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_path = get_example_file()[0]
>>> a_col = get_example_file(ext="tab")[0]
>>> a_gtf = GTF(a_path)
>>> a_gtf = a_gtf.add_attr_to_pos(open(a_col), new_key='foo')
>>> a_list = a_gtf.extract_data('foo', as_list=True)
>>> assert a_list[0] == 'AAA'
>>> assert a_list[1] == 'BBB'
>>> assert a_list[2] == '?'
>>> assert a_list[3] == 'CCC'
add_exon_number(key='exon_nbr')

Add exon number to exons.

Parameters

key – The key name to use for exon number.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> a_gtf = a_gtf.add_exon_number()
>>> a_list = a_gtf.select_by_key("feature", "exon").select_by_key("transcript_id", "G0004T002").extract_data("exon_nbr", as_list=True)
>>> assert  a_list == ['1', '2', '3']
>>> a_list = a_gtf.select_by_key("feature", "exon").select_by_key("transcript_id", "G0003T001").extract_data("exon_nbr", as_list=True)
>>> assert  a_list == ['2', '1']
add_prefix(feat='*', key=None, txt=None, suffix=False)

Add a prefix to an attribute value.

Parameters
  • feat – The target features (default to all).

  • key – The target key.

  • txt – The string to be added as a prefix.

  • suffix – If True append txt string as a suffix.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> a_gtf = a_gtf.add_prefix(feat="transcript", key="gene_id", suffix=False, txt="bla")
>>> a_list = a_gtf.select_by_key("feature", "transcript").extract_data("gene_id", as_list=True, nr=True)
>>> assert all([x.startswith('bla') for x in a_list])
>>> a_gtf = a_gtf.add_prefix(feat="transcript", key="gene_id", suffix=True, txt="bla")
>>> a_list = a_gtf.select_by_key("feature", "transcript").extract_data("gene_id", as_list=True, nr=True)
>>> assert all([x.endswith('bla') for x in a_list])
convert_to_ensembl(check_gene_chr=True)

Convert a GTF file to ensembl format. This method will add gene and transcript features if not encountered in the GTF.

Parameters

check_gene_chr – By default the function raises an error if several chromosomes are associated with the same gene_id.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> assert len(a_gtf.select_by_key("feature", "gene,transcript", invert_match=True).convert_to_ensembl()) == len(a_gtf)
del_attr(feat='*', keys=None, force=False)

Delete attributes from the GTF file.

Parameters
  • feat – The target feature.

  • keys – comma-separated list or list of target keys.

  • force – If True, transcript_id and gene_id can be deleted.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> from pygtftk.utils import TAB
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> assert a_gtf.del_attr(keys="gene_id,exon_id,ccds_id", force=True).get_attr_list() == ['transcript_id']
eval_numeric(bool_exp=None, na_omit=('.', '?'))

Test numeric values. Select lines using a boolean operation on attributes. The boolean expression must contain attributes enclosed with braces. Example: “cDNA_length > 200 and tx_genomic_length > 2000”. Note that one can refer to the name of the first columns of the gtf using e.g: start, end, score or frame.

Parameters
  • bool_exp – A boolean test.

  • na_omit – Any line for which one of the tested values is in this list won't be evaluated (e.g. (“.”, “?”)).

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> b_gtf = a_gtf.add_attr_from_list(feat="transcript", key="transcript_id", key_value=("G0001T001","G0002T001","G0003T001","G0004T001"), new_key="test", new_key_value=("10","11","20","40"))
>>> c_list = b_gtf.select_by_key("feature","transcript").eval_numeric("test > 2").extract_data("test", as_list=True)
>>> assert c_list == ['10', '11', '20', '40']
>>> assert len(b_gtf.eval_numeric('test > 0.1e2 and start < 180 and start >= 50')) == 2
>>> assert len(b_gtf.eval_numeric('test > 0.1e2 and start < 180 and start > 50')) == 1
>>> assert len(b_gtf.eval_numeric('test > 0.1e2')) == 3
>>> a_file = get_example_file(datasetname="mini_real", ext="gtf.gz")[0]
>>> a_gtf = GTF(a_file)
>>> tx = a_gtf.select_by_key('feature','transcript')
>>> assert len(tx.select_by_key("score", "0")) == len(tx.eval_numeric("score == 0"))
>>> assert len(tx.select_by_key("start", "1001145")) == len(tx.eval_numeric("start == 1001145"))
>>> assert len(tx.select_by_key("start", "1001145", invert_match=True)) == len(tx.eval_numeric("start != 1001145"))
>>> assert len(tx.select_by_key("end", "54801291")) == len(tx.eval_numeric("end == 54801291"))
>>> assert len(tx.eval_numeric("end == 54801291 and start == 53802771")) == 3
>>> assert len(tx.eval_numeric("end == 54801291 and (start == 53802771 or start == 53806156)")) == 4
>>> assert len(a_gtf.eval_numeric("phase > 0 and phase < 2", na_omit=".").extract_data('phase', as_list=True,  nr=True)) == 1
extract_data(keys=None, as_list=False, zero_based=False, no_na=False, as_dict_of_values=False, as_dict_of_lists=False, as_dict=False, as_dict_of_merged_list=False, nr=False, hide_undef=False, as_list_of_list=False, default_val=1)

Extract attribute values into containers of various types. Note that in case a dict is requested, keys are forced to no_na.

Note: no_na, hide_undef, nr, zero_based are not applied in case a tab object is returned (default).

Parameters
  • keys – The name of the basic or extended attributes.

  • as_list – if true returns values for the first requested key as a list.

  • no_na – Discard ‘.’ (i.e undefined value).

  • as_dict – if true returns a dict (where the key corresponds to values for the first requested key and values are set to 1). Note that ‘.’ won’t be set as a key (no need to use no_na).

  • as_dict_of_values – if true returns a dict (where the key corresponds to values for the first requested key and the value to value encountered for the second requested key).

  • as_dict_of_lists – if true returns a dict (where the key corresponds to values for the first requested key and the value to the last encountered list of other elements).

  • as_dict_of_merged_list – if true returns a dict (where the key corresponds to values for the first requested key and the value to the list of other elements encountered throughout any line).

  • as_list_of_list – if True returns a list for each line.

  • nr – in case as_list or as_list_of_list is chosen, return a non-redundant list.

  • hide_undef – if hide_undef, then records for which the key does not exists (value = “?”) are discarded.

  • zero_based – If set to True, the start position will be start-1.

  • default_val – The default value for the dict if as_dict is chosen.

Note: hide_undef and no_na are not supported when a TAB object is returned (default).

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> a_tab = a_gtf.extract_data("gene_id,transcript_id")
>>> assert a_tab.ncols == 2
>>> assert len(a_tab) == 70
>>> assert len(a_gtf.extract_data("transcript_id", nr=False, as_list=True,no_na=False, hide_undef=False)) == 70
>>> assert len(a_gtf.extract_data("transcript_id", nr=False, as_list=True,no_na=False, hide_undef=True)) == 60
>>> assert len(a_gtf.select_by_key("feature", "transcript").extract_data("seqid,start")) == 15
>>> assert len(a_gtf.select_by_key("feature", "transcript").extract_data("seqid,start",as_list=True))
>>> assert a_gtf.select_by_key("feature", "transcript").extract_data("seqid,start", as_list=True, nr=True) == ['chr1']
>>> assert a_gtf.select_by_key("feature", "transcript").extract_data("bla", as_list=True, nr=False, hide_undef=False).count('?') == 15
>>> assert a_gtf.select_by_key("feature", "transcript").extract_data("bla", as_list=True, nr=True, hide_undef=False).count('?') == 1
>>> assert len(a_gtf.select_by_key("feature", "transcript").extract_data("start", as_dict=True)) == 11
>>> assert len(a_gtf.select_by_key("feature", "transcript").extract_data("seqid", as_dict=True)) == 1
>>> assert [len(x) for x in a_gtf.select_by_key("feature", "transcript").extract_data("seqid,start", as_list_of_list=True)].count(2) == 15
>>> assert len(a_gtf.select_by_key("feature", "transcript").extract_data("seqid,start", as_list_of_list=True, nr=True)) == 11
extract_data_iter_list(keys=None, zero_based=False, nr=False)

Extract attributes of any type and returns an iterator of lists.

Parameters
  • keys – The name of the basic or extended attributes.

  • zero_based – If set to True, the start position will be start-1.

  • nr – if True, return a list with non redundant items.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> gen = a_gtf.extract_data_iter_list("start,gene_id")
>>> assert next(gen) == ['125', 'G0001']
>>> gen =a_gtf.extract_data_iter_list("start,gene_id", zero_based=True)
>>> assert next(gen) == ['124', 'G0001']
>>> gen = a_gtf.extract_data_iter_list("start,gene_id")
>>> assert len([x for x in gen]) == 70
>>> gen = a_gtf.extract_data_iter_list("start,gene_id", nr=True)
>>> assert len([x for x in gen]) == 26
get_3p_end(feat_type='transcript', name='transcript_id', sep='|', as_dict=False, more_name=(), feature_name=None, explicit=False)

Returns a Bedtool object containing the 3’ coordinates of selected features (Bed6 format) or a dict (zero-based coordinate).

Parameters
  • feat_type – The feature type.

  • name – The key that should be used to compute the ‘name’ column.

  • more_name – Additional text to add to the name (a list).

  • sep – The separator used for the name (e.g. ‘gene_id|transcript_id’).

  • as_dict – return the result as a dictionary.

  • feature_name – A feature name to be added to the 4th column.

  • explicit – Write explicitly the key name in the 4th column (e.g transcript_id=NM123|gene_name=AGENE|…).

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> a_bed = a_gtf.select_by_key("feature", "exon").get_3p_end(feat_type="exon")
>>> assert len(a_bed) == 25
>>> a_bed = a_gtf.select_by_key("feature", "exon").get_3p_end(feat_type="exon", name=["gene_id","exon_id"])
>>> assert len(a_bed) == 25
>>> a_bed = a_gtf.select_by_key("feature", "exon").get_3p_end(feat_type="exon", name=["gene_id","exon_id"], explicit=True)
>>> for i in a_bed: break
>>> assert i.name == 'gene_id=G0001|exon_id=G0001T002E001'
get_5p_end(feat_type='transcript', name='transcript_id', sep='|', as_dict=False, more_name=(), one_based=False, feature_name=None, explicit=False)

Returns a Bedtool object containing the 5’ coordinates of selected features (Bed6 format) or a dict (zero-based coordinate).

Parameters
  • feat_type – The feature type.

  • name – The key that should be used to compute the ‘name’ column.

  • sep – The separator used for the name (e.g. ‘gene_id|transcript_id’).

  • more_name – Additional text to add to the name (a tuple).

  • as_dict – return the result as a dictionary.

  • one_based – if as_dict is requested, return coordinates in one-based format.

  • feature_name – A feature name to be added to the 4th column.

  • explicit – Write explicitly the key name in the 4th column (e.g transcript_id=NM123|gene_name=AGENE|…).

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> a_bed = a_gtf.select_by_key("feature", "exon").get_5p_end(feat_type="exon")
>>> assert len(a_bed) == 25
>>> a_dict = a_gtf.select_by_key("feature", "exon").get_5p_end(feat_type="exon", name=["exon_id"], as_dict=True)
>>> assert len(a_dict) == 25
>>> a_dict = a_gtf.select_by_key("feature", "exon").get_5p_end(feat_type="exon", name=["exon_id", "transcript_id"], as_dict=True)
>>> assert len(a_dict) == 25
>>> a_bed = a_gtf.select_by_key("feature", "exon").get_5p_end(feat_type="exon", name=["transcript_id", "gene_id"], explicit=True)
>>> for i in a_bed: break
>>> assert i.name == 'transcript_id=G0001T002|gene_id=G0001'
get_attr_list(add_basic=False, as_dict=False)

Get the list of possible attributes from a GTF file.

Parameters
  • add_basic – add basic attributes to the list. Won’t work if as_dict is True.

  • as_dict – If True, returns a dict with the attribute names as keys and number of occurrences as values.

Example
>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> assert "".join([x[0] for x in GTF(a_file).get_attr_list()]) == 'gtec'
get_attr_value_list(key=None, count=False)

Get the list of possible values taken by an attribute from a GTF file.

Parameters
  • key – The key/attribute name to use.

  • count – whether the number of occurrences should also be returned (returns a list of lists).

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> assert len(a_gtf.get_attr_value_list(key="transcript_id")) == 15
>>> assert a_gtf.get_attr_value_list(key="transcript_id", count=True)[0] == ['G0001T001', '3']
>>> assert a_gtf.get_attr_value_list(key="transcript_id", count=True)[-1] == ['G0010T001', '3']
get_chroms(nr=False, as_dict=False)

Returns the chromosome/sequence IDs from the GTF.

Parameters
  • nr – True to get a non-redundant list.

  • as_dict – True to get info as a dict.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_chr = GTF(a_file).get_chroms(nr=True)
>>> assert len(a_chr) == 1
>>> a_chr = GTF(a_file).get_chroms()
>>> assert len(a_chr) == 70
get_feature_list(nr=False)

Returns the list of features.

Parameters

nr – True to get a non-redundant list.

Example

>>> from pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> a_feat = a_gtf.get_feature_list(nr=True)
>>> assert len(a_feat) == 4
>>> a_feat = a_gtf.get_feature_list(nr=False)
>>> assert len(a_feat) == 70
get_feature_size(ft_type='gene', feat_id=None)

Returns feature size as a dict.

Parameters
  • ft_type – The feature type (3rd column). Could be gene, transcript, exon, CDS…

  • feat_id – the string used as feature identifier in the GTF. Could be gene_id, transcript_id, exon_id, CDS_id… By default it is computed as ft_type + ‘_id’

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> assert len(a_gtf.get_feature_size("exon", "exon_id")) == 25
get_gn_ids(nr=False)

Returns all gene ids from the GTF file

Parameters

nr – True to get a non-redundant list.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> gn_ids = a_gtf.get_gn_ids()
>>> assert len(gn_ids) == 70
>>> gn_ids = a_gtf.get_gn_ids(nr=True)
>>> assert len(gn_ids) == 10
>>> a_gtf = GTF(a_file).select_by_key("gene_id","G0001,G0002")
>>> gn_ids = a_gtf.get_gn_ids(nr=True)
>>> assert len(gn_ids) == 2
get_gn_strand()

Returns a dict with gene IDs as keys and strands as values.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> strands = a_gtf.get_gn_strand()
>>> assert strands['G0001'] == '+'
>>> assert strands['G0001'] == '+'
>>> assert strands['G0005'] == '-'
>>> assert strands['G0006'] == '-'
get_gn_to_tx(ordered_5p=False, as_dict_of_dict=False)

Returns a dict with genes as keys and the list of their associated transcripts as values.

params ordered_5p: If True, returns transcript lists ordered based on their TSS coordinates (5’ to 3’). For ties, transcript names are randomly picked. params as_dict_of_dict: If True, returns a dict of dicts that maps gene_id to transcript_id and transcript_id to TSS numbering (1 for most 5’, then 2…). For transcripts having the same TSS, the TSS number will be the same.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file(datasetname="mini_real",ext="gtf.gz")[0]
>>> a_gtf = GTF(a_file)
>>> gn_2_tx = a_gtf.get_gn_to_tx(ordered_5p=True)
>>> assert gn_2_tx['ENSG00000148337'][0:3] == ['ENST00000372948', 'ENST00000634901', 'ENST00000634501']
>>> assert gn_2_tx['ENSG00000148339'][0:3]  == ['ENST00000373068', 'ENST00000373069', 'ENST00000373066']
>>> assert gn_2_tx['ENSG00000148339'][5:]  == ['ENST00000472769', 'ENST00000445012', 'ENST00000466983']
>>> gn_2_tx = a_gtf.get_gn_to_tx(as_dict_of_dict=True)
>>> assert gn_2_tx['ENSG00000153885']['ENST00000587658'] == 1
>>> assert gn_2_tx['ENSG00000153885']['ENST00000587559'] == 2
>>> assert gn_2_tx['ENSG00000153885']['ENST00000430256'] == 8
>>> assert gn_2_tx['ENSG00000153885']['ENST00000284006'] == gn_2_tx['ENSG00000153885']['ENST00000589786']
>>> assert gn_2_tx['ENSG00000164587']['ENST00000407193'] == 1
>>> assert gn_2_tx['ENSG00000164587']['ENST00000401695'] == 2
>>> assert gn_2_tx['ENSG00000164587']['ENST00000519690'] == 3
>>> assert gn_2_tx['ENSG00000105483']['ENST00000517510'] == gn_2_tx['ENSG00000105483']['ENST00000520007']
get_gname_to_tx()

Returns a dict with gene names as keys and the list of their associated transcripts as values.

get_intergenic(chrom_file=None, upstream=0, downstream=0, chr_list=None, feature_name=None)

Returns a bedtools object containing the intergenic regions.

Parameters
  • chrom_file – file object pointing to a tabulated two-columns file. Chromosomes as column 1 and their sizes as column 2 (default: None)

  • upstream – Extend transcript coordinates in 5’ direction.

  • downstream – Extend transcript coordinates in 3’ direction.

  • chr_list – a list of chromosomes to restrict intergenic regions.

  • feature_name – A feature name to be added to the 4th column.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> chr_info_path = get_example_file(ext="chromInfo")[0]
>>> chr_info_file = open(chr_info_path, "r")
>>> a_gtf = GTF(a_file)
>>> a_bed = a_gtf.get_intergenic(chrom_file=chr_info_file)
>>> assert len(a_bed) == 10
get_introns(by_transcript=False, name=('transcript_id', 'gene_id'), sep='|', intron_nb_in_name=False, feat_name=False, feat_name_last=False)

Returns a bedtools object containing the intronic regions.

Parameters
  • by_transcript – If false (default), merged genic regions with no exonic overlap are returned. Otherwise, the intronic regions corresponding to each transcript are returned (may contain exonic overlap).

  • name – The list of ids that should be part of the 4th column of the bed file (if by_transcript is chosen).

  • sep – The separator to be used for the name (if by_transcript is chosen).

  • intron_nb_in_name – by default the intron number is added to the score column. If selected, write this information in the ‘name’ column of the bed file.

  • feat_name – add the feature name (‘intron’) in the name column.

  • feat_name_last – put feat_name in last position (but before intron_nb_in_name).

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> a_bed = a_gtf.get_introns()
>>> assert len(a_bed) == 7
get_midpoints(name=('transcript_id', 'gene_id'), sep='|')

Returns a bedtools object containing the midpoints of features.

Parameters
  • name – The key that should be used to compute the ‘name’ column.

  • sep – The separator used for the name (e.g. ‘gene_id|transcript_id’).

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> a_bed = a_gtf.select_by_key("feature", "exon").get_midpoints()
>>> assert len(a_bed) == 25
get_sequences(genome=None, intron=False, rev_comp=True)

Returns a FASTA object from which various sequences can be retrieved.

param genome

The genome in fasta file.

param intron

Set to True to get also intronic regions. Required for some methods of the FASTA object.

param rev_comp

By default sequence for genes on minus strand are reversed-complemented. Set to False to avoid default behaviour.

Example

>>> from pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file("simple", "gtf")[0]
>>> a_gtf = GTF(a_file)
>>> genome_fa = get_example_file("simple", "fa")[0]
>>> # No intron, reverse-complement
>>> a_fa = a_gtf.get_sequences(genome_fa)
>>> assert len(a_fa) == 15
>>> a_dict = dict()
>>> for i in a_fa: a_dict[i.header] = i.sequence
>>> assert a_dict['>G0001_NA_G0001T001_chr1:125-138_+_mRNA'] == 'cccccgttacgtag'
>>> assert a_dict['>G0008_NA_G0008T001_chr1:210-222_-_RC_mRNA'] ==  'catgcgct'
>>> assert a_dict['>G0004_NA_G0004T001_chr1:65-76_+_mRNA'] == 'atctggcg'
>>> # No intron, no reverse-complement
>>> a_fa = a_gtf.get_sequences(genome_fa, rev_comp=False)
>>> a_dict = dict()
>>> for i in a_fa: a_dict[i.header] = i.sequence
>>> assert a_dict['>G0001_NA_G0001T001_chr1:125-138_+_mRNA'] == 'cccccgttacgtag'
>>> assert a_dict['>G0008_NA_G0008T001_chr1:210-222_-_mRNA'] ==  'agcgcatg'
>>> assert a_dict['>G0004_NA_G0004T001_chr1:65-76_+_mRNA'] == 'atctggcg'
>>> # Intron and Reverse-complement
>>> a_fa = a_gtf.get_sequences(genome_fa, intron=True)
>>> a_dict = dict()
>>> for i in a_fa: a_dict[i.header] = i.sequence
>>> assert a_dict['>G0008_NA_G0008T001_chr1:210-222_-_RC'] == 'catatggtgcgct'
>>> assert a_dict['>G0004_NA_G0004T001_chr1:65-76_+'] == 'atctcaggggcg'
>>> # Intron and no Reverse-complement
>>> a_fa = a_gtf.get_sequences(genome_fa, intron=True, rev_comp=False)
>>> a_dict = dict()
>>> for i in a_fa: a_dict[i.header] = i.sequence
>>> assert a_dict['>G0008_NA_G0008T001_chr1:210-222_-'] == 'agcgcaccatatg'
>>> assert a_dict['>G0004_NA_G0004T001_chr1:65-76_+'] == 'atctcaggggcg'
get_transcript_size(with_intron=False)

Returns a dict with transcript_id as keys and the mature RNA length (i.e. exons only) of each transcript as values.

Parameters

with_intron – If True, add intronic regions.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_path = get_example_file()[0]
>>> gtf = GTF(a_path)
>>> size_dict = gtf.get_transcript_size()
>>> assert(size_dict["G0008T001"] == 8)
>>> size_dict = gtf.get_transcript_size(with_intron=True)
>>> assert(size_dict["G0008T001"] == 13)
get_tss(name='transcript_id', sep='|', as_dict=False, more_name=(), one_based=False, feature_name=None, explicit=False)

Returns a Bedtool object containing the TSSs (Bed6 format) or a dict (zero-based coordinate).

Parameters
  • name – The key that should be used to compute the ‘name’ column.

  • more_name – Additional text to add to the name (a tuple).

  • sep – The separator used for the name (e.g. ‘gene_id|transcript_id’).

  • as_dict – return the result as a dictionary.

  • one_based – if as_dict is requested, return coordinates in one-based format.

  • feature_name – A feature name to be added to the 4th column.

  • explicit – Write explicitly the key name in the 4th column (e.g transcript_id=NM123|gene_name=AGENE|…).

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> a_bed = a_gtf.get_tss()
>>> assert len(a_bed) == 15
get_tts(name='transcript_id', sep='|', as_dict=False, more_name=(), feature_name=None, explicit=False)

Returns a Bedtool object containing the TTSs (Bed6 format) or a dict (zero-based coordinate).

Parameters
  • name – The key that should be used to compute the ‘name’ column.

  • sep – The separator used for the name (e.g. ‘gene_id|transcript_id’).

  • as_dict – return the result as a dictionary.

  • more_name – Additional text to add to the name (a list).

  • feature_name – A feature name to be added to the 4th column.

  • explicit – Write explicitly the key name in the 4th column (e.g transcript_id=NM123|gene_name=AGENE|…).

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> a_bed = a_gtf.get_tts()
>>> assert len(a_bed) == 15
get_tx_ids(nr=False)

Returns all transcript_id from the GTF file

Parameters

nr – True to get a non-redundant list.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> tx_ids = a_gtf.get_tx_ids()
>>> assert len(tx_ids) == 60
>>> tx_ids = a_gtf.get_tx_ids(nr=True)
>>> assert len(tx_ids) == 15
>>> a_gtf = GTF(a_file).select_by_key("transcript_id","G0001T002,G0001T001,G0003T001")
>>> tx_ids = a_gtf.get_tx_ids(nr=True)
>>> assert len(tx_ids) == 3
get_tx_strand()

Returns a dict with transcript IDs as keys and strands as values.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> strands = a_gtf.get_tx_strand()
>>> assert strands['G0007T001'] == '+'
>>> assert strands['G0001T002'] == '+'
>>> assert strands['G0009T001'] == '-'
>>> assert strands['G0008T001'] == '-'
get_tx_to_gn()

Returns a dict with transcripts IDs as keys and gene IDs as values.

get_tx_to_gname()

Returns a dict with transcript IDs as keys and gene names as values.

head(nb=6, returned=False)

Print the nb first lines of the GTF object.

Parameters
  • nb – The number of lines to display.

  • returned – If True, don’t print but returns the message.

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> res = a_gtf.head(10, returned=True)
is_defined(key=None)

Extract a numpy array representation of a GTF attribute indicating, for each line, whether the key is defined (i.e. exists). In pygtftk, values for undefined keys are ‘?’. So in other words this function tests whether each value differs from ‘?’.

Parameters

key

Returns

a numpy array of booleans.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> assert sum(a_gtf.is_defined("transcript_id")) == 60
>>> gn_val = ('.', '.', '.', '.', 1, 2, 3, 4, '?', '?')
>>> gn_ids = tuple(a_gtf.get_gn_ids(nr=True))
>>> b_gtf = a_gtf.add_attr_from_list(feat="gene", key="gene_id", key_value=(gn_ids), new_key="foo", new_key_value=gn_val)
>>> gn_feat = b_gtf[b_gtf.feature == 'gene']
>>> assert sum((~gn_feat.is_defined('foo') | ~gn_feat.is_set('foo'))) == 6
is_set(key=None)

Extract a numpy array representation of a GTF attribute indicating, for each line, whether the key is set (i.e. is defined with a value other than ‘.’). In pygtftk, values for unset keys are ‘.’. So in other words this function tests whether each value for the input key is set.

Parameters

key

Returns

a numpy array of booleans.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> assert sum(a_gtf.is_defined("transcript_id")) == 60
merge_attr(feat='*', keys='gene_id,transcript_id', new_key='gn_tx_id', sep='|')

Merge a set of source keys (e.g gene_id and transcript_id) into a destination attribute.

Parameters
  • feat – The target features.

  • keys – The source keys.

  • new_key – The destination key.

  • sep – The separator.

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> a_list = a_gtf.merge_attr(feat="exon,transcript,CDS", keys="gene_id,transcript_id", new_key="merge").extract_data("merge", hide_undef=True, as_list=True, nr=True)
>>> assert a_list[0] == 'G0001|G0001T002'
message(msg='', type='INFO')

A processing message whose verbosity is adapted based on pygtftk.utils.VERBOSITY.

>>> import pygtftk.utils
>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> pygtftk.utils.VERBOSITY = 0
>>> pygtftk.utils.VERBOSITY = 2
>>> a_gtf.message('bla')
>>> pygtftk.utils.VERBOSITY = 3
>>> a_gtf.message('bla')
>>> pygtftk.utils.VERBOSITY = 0
nb_exons()

Return a dict with transcript as key and number of exon as value.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> nb_ex = a_gtf.nb_exons()
>>> assert len(nb_ex) == 15
>>> assert nb_ex['G0006T001'] == 3
>>> assert nb_ex['G0004T002'] == 3
>>> assert nb_ex['G0004T002'] == 3
>>> assert nb_ex['G0005T001'] == 2
>>> a_file = get_example_file("simple_04")[0]
>>> a_gtf = GTF(a_file)
>>> nb_ex = a_gtf.nb_exons()
>>> assert nb_ex['G0004T001'] == 4
nrow()

Get the number of lines enclosed in the GTF object.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> assert a_gtf.nrow() == 70
>>> assert a_gtf.select_by_key("feature","gene").nrow() == 10
>>> assert a_gtf.select_by_key("feature","transcript").nrow() == 15
>>> assert a_gtf.select_by_key("feature","exon").nrow() == 25
>>> assert a_gtf.select_by_key("feature","CDS").nrow() == 20
select_5p_transcript()

Select the most 5’ transcript of each gene. Note that the genomic boundaries for each gene could differ from that displayed by its associated transcripts. The gene features may be subsequently deleted using select_by_key.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
select_by_key(key=None, value=None, invert_match=False, file_with_values=None, no_na=True, col=1)

Select lines from a GTF based i) on attributes and values, ii) on feature type.

Parameters
  • key – The key/attribute name to use for selection.

  • value – Value for that key/attribute.

  • invert_match – Boolean. Select lines that do not match that key and value.

  • file_with_values – A file containing values (one column).

  • col – The column number (one-based) that contains the values in the file.

  • no_na – If invert_match is True then returns only record for which the value is defined.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> b_gtf = a_gtf.select_by_key("gene_id", "G0004")
>>> assert len(b_gtf) == 13
>>> b_gtf = a_gtf.select_by_key("transcript_id", "G0004T002")
>>> assert len(b_gtf) == 7
>>> b_gtf = a_gtf.select_by_key("feature", "transcript")
>>> assert len(b_gtf) == 15
>>> b_gtf = a_gtf.select_by_key("feature", "gene")
>>> assert len(b_gtf) == 10
>>> assert len(a_gtf.select_by_key("feature", "exon")) == 25
select_by_loc(chr_str, start_str, end_str)

Select lines from a GTF based on genomic location.

Parameters
  • chr_str – A comma-separated list of chromosomes.

  • start_str – A comma-separated list of starts.

  • end_str – A comma-separated list of ends.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> b = a_gtf.select_by_loc("chr1", "125", "138")
>>> assert len(b) == 7
select_by_max_exon_nb()

For each gene select the transcript with the highest number of exons. If ties, select the first encountered.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file("simple_04")[0]
>>> a_gtf = GTF(a_file)
>>> b = a_gtf.select_by_max_exon_nb()
>>> l = b.extract_data("transcript_id", as_list=True, nr=True, no_na=True, hide_undef=True)
>>> assert "G0005T001" not in l
>>> assert "G0004T002" not in l
>>> assert "G0006T002" not in l
>>> assert len(l) == 10
select_by_number_of_exons(min=0, max=None)

Select transcript from a GTF based on the number of exons.

Parameters
  • min – Minimum number of exons.

  • max – Maximum number of exons.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> b = a_gtf.select_by_number_of_exons(3,3).select_by_key("feature","transcript")
>>> assert len(b) == 3
select_by_positions(pos=None)

Select a set of lines by position. Numbering zero-based.

Parameters

pos

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> assert a_gtf.select_by_positions([0]).extract_data("feature", as_list=True) == ['gene']
>>> assert a_gtf.select_by_positions(list(range(3))).extract_data("feature", as_list=True) == ['gene', 'transcript', 'exon']
>>> assert a_gtf.select_by_positions([3,4,1]).select_by_positions([0,1]).extract_data("feature", as_list=True) == ['CDS', 'transcript']
select_by_regexp(key=None, regexp=None, invert_match=False)

Returns lines for which key value match a regular expression.

Parameters
  • key – The key/attribute name to use for selection.

  • regexp – the regular expression

  • invert_match – Select lines are those that do not match.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> b_list = a_gtf.select_by_regexp("transcript_id", "^G00[012]+T00[12]$").select_by_key("feature","transcript").extract_data("transcript_id", as_list=True)
>>> assert b_list == ['G0001T002', 'G0001T001', 'G0002T001', 'G0010T001']
>>> assert len(a_gtf.select_by_regexp("transcript_id", "^G00\d+T00\d+$"))==60
>>> assert len(a_gtf.select_by_regexp("transcript_id", "^G00\d+T00.*2$").get_tx_ids(nr=True))==5
select_by_transcript_size(min=0, max=1000000000)

Select ‘mature’/spliced transcripts by size.

Parameters
  • min – Minimum size for the feature.

  • max – Maximum size for the feature.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> tx_kept = a_gtf.select_by_transcript_size(min=5, max=10)
>>> assert len(tx_kept) == 45
select_longuest_transcripts()

Select the longest transcript of each gene. The “gene” rows are returned in the results. Note that the genomic boundaries for each gene could differ from that displayed by its associated transcripts. The gene features may be subsequently deleted using select_by_key.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
select_shortest_transcripts()

Select the shortest transcript of each gene. Note that the genomic boundaries for each gene could differ from that displayed by its associated transcripts. The gene features may be subsequently deleted using select_by_key.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
tail(nb=6, returned=False)

Print the nb last lines of the GTF object.

Parameters
  • nb – The number of lines to display.

  • returned – If True, don’t print but returns the message.

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> res = a_gtf.tail(10, returned=True)
to_bed(name='gene_id', sep='|', add_feature_type=False, more_name=None)

Returns a Bedtool object (Bed6 format).

Parameters
  • name – The keys that should be used to compute the ‘name’ column (a tuple).

  • more_name – Additional text to add to the name (a list).

  • sep – The separator used for the name (e.g. ‘gene_id|transcript_id’).

  • add_feature_type – Add the feature type to the name.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> a_bo = a_gtf.select_by_key("feature", "transcript").to_bed()
>>> assert a_bo.field_count() == 6
>>> assert len(a_bo) == 15
>>> for i in a_bo: pass
>>> a_bo = a_gtf.select_by_key("feature", "transcript").to_bed(name=('gene_id', 'transcript_id'))
>>> for i in a_bo: pass
>>> a_bo = a_gtf.select_by_key("feature", "transcript").to_bed(name=('gene_id', 'transcript_id'), sep="--")
>>> for i in a_bo: pass
write(output, add_chr=0, gc_off=False)

Write the GTF to a file.

Parameters
  • output – A file object where the GTF has to be written.

  • add_chr – whether to add ‘chr’ prefix to seqid before printing.

  • gc_off – If True, shutdown the garbage collector before printing to avoid passing through free_gtf_data which can be time-consuming.

Example

>>> from pygtftk.utils import simple_line_count
>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.utils import make_tmp_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> b_file = make_tmp_file()
>>> a_gtf.write(b_file)
>>> assert simple_line_count(b_file) == 70
write_bed(outputfile=None, name='gene_id', sep='|', add_chr=0, more_name=None)

Write a GTF file in BED format. This function uses a direct call to the C interface.

Parameters
  • outputfile – The output file name.

  • name – The keys that should be used to compute the ‘name’ column (a tuple).

  • more_name – Additional text to add to the name (a list).

  • sep – The separator used for the name (e.g. ‘gene_id|transcript_id’).

The FASTA class

Class declaration of the FASTA object (may be returned by GTF object methods).

class pygtftk.fasta_interface.FASTA(fn, ptr, intron, rev_comp)

A class representation of a fasta file connection. An end product from a GTF object.

transcript_as_bioseq_records()

Iterate over transcript sequences with or without intron depending on arguments provided to GTF.get_sequences(). Returns objects of class Bio.SeqRecord.

Example

>>> from pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file("simple", "gtf")[0]
>>> a_gtf = GTF(a_file)
>>> genome_fa = get_example_file("simple", "fa")[0]
>>> a_fa = a_gtf.get_sequences(genome_fa)
>>> a_list = []
>>> for i in a_fa.transcript_as_bioseq_records(): a_list += [i]
>>> assert len(a_list) == 15
>>> rec = a_list[0]
write(outputfile)

Write the sequences to a fasta file.

Parameters

outputfile – Output file (file object).

Example

>>> from pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> from pygtftk.utils import make_tmp_file
>>> from pygtftk.utils import simple_line_count
>>> a_file = get_example_file("simple", "gtf")[0]
>>> a_gtf = GTF(a_file)
>>> genome_fa = get_example_file("simple", "fa")[0]
>>> a_fa = a_gtf.get_sequences(genome_fa, intron=True)
>>> tmp_file = make_tmp_file()
>>> a_fa.write(tmp_file)
>>> tmp_file.close()
>>> assert simple_line_count(open(tmp_file.name)) == 30

The TAB class

Class declaration of the TAB object (may be returned by a GTF instance).

class pygtftk.tab_interface.TAB(fn, ptr, dll=None)

A class representation of a tabulated matrix. An end product from a GTF object.

as_data_frame()

Convert the TAB object into a dataframe.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()
>>> a_gtf = GTF(a_file)
>>> a_tab = a_gtf.extract_data("gene_id,start")
>>> assert a_tab.as_data_frame()["gene_id"].nunique() == 10
as_simple_list(which_col=0)

Convert the selected column of a TAB object into a list.

Parameters

which_col – The column number.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)[("feature", "gene")]
>>> a_tab = a_gtf.extract_data("seqid")
>>> assert list(set(a_tab.as_simple_list(0))) == ['chr1']
iter_as_list()

Iterate over the TAB object and return a list of self.nb_columns elements.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()
>>> a_gtf = GTF(a_file)
>>> a_tab = a_gtf.extract_data("gene_id,start")
>>> assert next(a_tab.iter_as_list()) == list(a_tab[0])
iterate_with_header()

Iterate over FieldSets. The first line contains the field names.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file).extract_data("all")
>>> assert next(a_gtf.iterate_with_header())[0] == 'seqid'
>>> for i in a_gtf.iterate_with_header(): pass
>>> assert 'CDS_G0010T001' in list(i)
write(outfile=None, sep='\t')

Write a tab object to a file.

Example

>>> from  pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> from pygtftk.utils import make_tmp_file
>>> from pygtftk.utils import simple_line_count
>>> from pygtftk.utils import simple_nb_column
>>> a_file = get_example_file()[0]
>>> a_tab = GTF(a_file).extract_data("transcript_id,gene_id")
>>> out_file = make_tmp_file()
>>> a_tab.write(out_file)
>>> out_file.close()
>>> assert simple_line_count(out_file) == 70
>>> assert simple_nb_column(out_file) == 2

The Line module

This section contains representation of GTF, TAB and FASTA records (i.e, ‘lines’). When iterating over GTF, TAB or FASTA instances, the following objects will be returned respectively:

  • Feature

  • FieldSet

  • FastaSequence

class pygtftk.Line.FastaSequence(ptr=None, alist=None, feat='transcript', rev_comp=False)

A class representing a fasta file line.

format()

Return a formated fasta sequence.

Example

>>> from pygtftk.Line import FastaSequence
>>> a = FastaSequence(alist=['>bla', 'AATACAGAGAT','chr21','+', 'BLA', 'NM123', 123, 456, 'transcript'])
>>> assert a.sequence == 'AATACAGAGAT'
>>> assert a.header == '>bla'
write(file_out=None)

Write FastaSequence to a file or stdout (if file is None).

Example

>>> from pygtftk.Line import FastaSequence
>>> from pygtftk.utils import  make_tmp_file
>>> a = FastaSequence(alist=['>bla', 'AATACAGAGAT','chr21','+', 'BLA', 'NM123', 123, 456, 'transcript'])
>>> f = make_tmp_file()
>>> a.write(f)
>>> f.close()
>>> from pygtftk.utils import  simple_line_count
>>> assert simple_line_count(f) == 2
class pygtftk.Line.Feature(ptr=None, alist=None)

Class representation of a genomic feature. Corresponds to a GTF line.

add_attr(key, val)

Add an attribute to the Feature instance.

Example

>>> from pygtftk.utils import get_example_feature
>>> feat = get_example_feature()
>>> feat.add_attr("foo", "bar")
>>> assert feat.get_attr_value('foo') == ['bar']
add_attr_and_write(key, val, outputfile)

Add a key/attribute record and write to a file.

Example

>>> from pygtftk.utils import get_example_feature
>>> from pygtftk.utils import make_tmp_file
>>> feat = get_example_feature()
>>> tmp_file =  make_tmp_file()
>>> feat.add_attr_and_write("foo", "bar", tmp_file)
>>> tmp_file.close()
>>> from pygtftk.gtf_interface import  GTF
>>> gtf = GTF(tmp_file.name, check_ensembl_format=False)
>>> assert gtf.extract_data("foo", as_list=True) == ['bar']
format()

Returns Feature as a string in gtf file format. Typically to write to a gtf file.

Example

>>> from pygtftk.utils import get_example_feature
>>> from pygtftk.utils import TAB
>>> feat = get_example_feature()
>>> assert len(feat.format().split(TAB)) == 9
format_tab(attr_list, sep='\t')

Returns Feature as a string in tabulated format.

Parameters
  • attr_list – the attributes names.

  • sep – separator.

Example

>>> from pygtftk.utils import get_example_feature
>>> from pygtftk.utils import TAB
>>> feat = get_example_feature()
>>> a = TAB.join(['chr1', 'Unknown', 'transcript', '100', '200', '.', '+', '.', 'g1t1'])
>>> assert feat.format_tab('transcript_id') == a
classmethod from_list(a_list)

Create a Feature object from a list.

Params alist

a list.

Example

>>> from pygtftk.Line import Feature
>>> alist = ['chr1','Unknown','transcript', 100, 200]
>>> from collections import OrderedDict
>>> d = OrderedDict()
>>> d['transcript_id'] = 'g1t1'
>>> d['gene_id'] = 'g1'
>>> alist +=['.','+','.',d]
>>> a = Feature.from_list(alist)
>>> assert a.attr['transcript_id'] == 'g1t1'
>>> assert a.attr['gene_id'] == 'g1'
get_3p_end()

Get the 3’ end of the feature. Returns ‘end’ if on ‘+’ strand ‘start’ otherwise (one based).

Example

>>> from pygtftk.utils import get_example_feature
>>> feat = get_example_feature()
>>> assert feat.get_3p_end() == 200
get_5p_end()

Get the 5’ end of the feature. Returns ‘start’ if on ‘+’ strand ‘end’ otherwise (one based).

Example

>>> from pygtftk.utils import get_example_feature
>>> feat = get_example_feature()
>>> assert feat.get_5p_end() == 100
>>> from pygtftk.Line import Feature
>>> from collections import OrderedDict
>>> a_feat = Feature.from_list(['1','pygtftk', 'exon', '100', '200', '0', '-', '1', OrderedDict()])
>>> assert a_feat.get_5p_end() == 200
>>> a_feat = Feature.from_list(['1','pygtftk', 'exon', '100', '200', '0', '+', '1', OrderedDict()])
>>> assert a_feat.get_5p_end() == 100
get_attr_names()

Returns the attribute names from the Feature.

Example

>>> from pygtftk.utils import get_example_feature
>>> feat = get_example_feature()
>>> assert 'transcript_id' in feat.get_attr_names()
>>> assert 'gene_id' in feat.get_attr_names()
get_attr_value(attr_name, upon_none='continue')

Get the value of a basic or extended attribute.

Parameters
  • attr_name – Name of the attribute/key.

  • upon_none – Whether we should ‘continue’, ‘raise’ an error or ‘set_na’.

Example

>>> from pygtftk.utils import get_example_feature
>>> feat = get_example_feature()
>>> assert feat.get_attr_value('transcript_id') == ['g1t1']
>>> assert feat.get_attr_value('chrom') == ['chr1']
>>> assert feat.get_attr_value('end') == [200]
>>> assert feat.get_attr_value('bla', upon_none='continue') == [None]
>>> assert feat.get_attr_value('bla', upon_none='set_na') == ['.']
get_gn_id()

Get value for ‘gene_id’ attribute.

Example

>>> from pygtftk.utils import get_example_feature
>>> feat = get_example_feature()
>>> assert feat.get_gn_id() == 'g1'
get_tx_id()

Get value for ‘transcript_id’ attribute.

Example

>>> from pygtftk.utils import get_example_feature
>>> feat = get_example_feature()
>>> assert feat.get_tx_id() == 'g1t1'
set_attr(key, val, upon_none='continue')

Set an attribute value of the Feature instance.

Parameters
  • key – The key.

  • val – The value.

  • upon_none – Whether we should ‘continue’, ‘raise’ an error or ‘set_na’.

Example

>>> from pygtftk.utils import get_example_feature
>>> feat = get_example_feature()
>>> feat.set_attr("gene_id", "bla")
>>> assert feat.get_gn_id() == 'bla'
write(file_out='-')

Write Feature to a file or stdout (if file is ‘-‘).

Example

>>> from pygtftk.utils import get_example_feature
>>> from pygtftk.utils import make_tmp_file
>>> feat = get_example_feature()
>>> tmp_file =  make_tmp_file()
>>> feat.write(tmp_file)
>>> tmp_file.close()
>>> from pygtftk.utils import  simple_line_count
>>> assert simple_line_count(tmp_file) == 1
write_bed(name=None, format='bed6', outputfile=None)

Write the Feature instance in bed format (Zero-based).

Parameters
  • name – A string to use as name (Column 4).

  • format – Format should be one of ‘bed/bed6’ or ‘bed3’. Default to bed6.

  • outputfile – the file object where data should be printed.

Example

>>> from pygtftk.utils import get_example_feature
>>> from pygtftk.utils import make_tmp_file
>>> from pygtftk.utils import TAB
>>> from pygtftk.utils import  simple_line_count
>>> tmp_file =  make_tmp_file()
>>> feat = get_example_feature()
>>> feat.write_bed(name="foo", outputfile=tmp_file)
>>> tmp_file.close()
>>> for line in open(tmp_file.name): line= line.split(TAB)
>>> assert line[3] == 'foo'
>>> assert simple_line_count(tmp_file) == 1
write_bed_3p_end(name=None, format='bed6', outputfile=None)

Write the 3p end coordinates of feature in bed format (Zero-based).

Parameters
  • outputfile – The output file name.

  • name – The string to be used as fourth column.

  • format – Format should be one of ‘bed/bed6’ or ‘bed3’. Default to bed6.

Example

>>> from pygtftk.utils import get_example_feature
>>> from pygtftk.utils import make_tmp_file
>>> from pygtftk.utils import TAB
>>> from pygtftk.utils import  simple_line_count
>>> tmp_file =  make_tmp_file()
>>> feat = get_example_feature()
>>> feat.write_bed_3p_end(name='foo', outputfile=tmp_file)
>>> tmp_file.close()
>>> for line in open(tmp_file.name): line= line.split(TAB)
>>> assert line[1] == '199'
>>> assert line[2] == '200'
>>> assert simple_line_count(tmp_file) == 1
write_bed_5p_end(name=None, format='bed6', outputfile=None)

Write the 5p end coordinates of feature in bed format (Zero-based).

Parameters
  • outputfile – the output file.

  • name – The keys that should be used to compute the ‘name’ column.

  • format – Format should be one of ‘bed/bed6’ or ‘bed3’. Default to bed6.

Example

>>> from pygtftk.utils import get_example_feature
>>> from pygtftk.utils import make_tmp_file
>>> from pygtftk.utils import TAB
>>> from pygtftk.utils import  simple_line_count
>>> tmp_file =  make_tmp_file()
>>> feat = get_example_feature()
>>> feat.write_bed_5p_end(name='foo', outputfile=tmp_file)
>>> tmp_file.close()
>>> for line in open(tmp_file.name): line= line.split(TAB)
>>> assert line[1] == '99'
>>> assert line[2] == '100'
>>> assert simple_line_count(tmp_file) == 1
write_gtf_to_bed6(name=('transcript_id', 'gene_id'), sep='|', add_feature_type=False, outputfile=None)

Write the Feature instance in bed format (Zero-based). Select the key/values to add as a name.

Parameters
  • name – The keys that should be used to compute the ‘name’ column.

  • sep – The separator used for the name (e.g. ‘gene_id|transcript_id’).

  • add_feature_type – Add the feature type to the name.

  • outputfile – the file object where data should be printed.

Example

>>> from pygtftk.utils import get_example_feature
>>> from pygtftk.utils import make_tmp_file
>>> from pygtftk.utils import TAB
>>> from pygtftk.utils import  simple_line_count
>>> tmp_file =  make_tmp_file()
>>> feat = get_example_feature()
>>> feat.write_gtf_to_bed6(outputfile=tmp_file)
>>> tmp_file.close()
>>> for line in open(tmp_file.name): line= line.split(TAB)
>>> assert len(line) == 6
>>> assert line[3] == 'g1t1|g1'
>>> assert simple_line_count(tmp_file) == 1
class pygtftk.Line.FieldSet(ptr=None, size=None, alist=None, ft_type=None)

A class representing a set of Fields obtained from a split line.

format(separator='\t')

Return a tabulated string.

Parameters

separator – output field separator.

Example

>>> from pygtftk.Line import FieldSet
>>> from pygtftk.utils import TAB
>>> a = FieldSet(alist=['chr1', '123', '456'])
>>> assert len(a.format().split(TAB)) == 3
write(file_out=None, separator='\t')

Write FieldSet to a file or stdout (if file is None).

Parameters
  • separator – output field separator.

  • file_out – output file object.

Example

>>> from pygtftk.Line import FieldSet
>>> from pygtftk.utils import  make_tmp_file
>>> from pygtftk.utils import  simple_line_count
>>> from pygtftk.utils import  simple_nb_column
>>> a = FieldSet(alist=['chr1', '123', '456'])
>>> f = make_tmp_file()
>>> a.write(f)
>>> f.close()
>>> assert simple_line_count(f) == 1
>>> assert simple_nb_column(f) == 3

The pygtftk.stats.intersect.overlap_stats_shuffling module

This is the main function to compute random shuffles to assess statistical significance overlap of two sets of genomic regions, provided as BED files.

Called directly by the ologram.py plugin. All other functions calls descend from this one.

Author : Quentin Ferré <quentin.q.ferre@gmail.com>

class pygtftk.stats.intersect.overlap_stats_shuffling.ComputingIntersectionPartial(Lr1, Li1, Lrs, Lis, all_chrom1, all_chrom2, use_markov_shuffling, keep_intact_in_shuffling, nb_threads)

This is a replacement for a wrapper for compute_all_intersections_minibatch, needed for multiprocessing below. This was needed because the argument that changes in multiprocessing is minibatch_len, and it’s not the leftmost argument, so functools.partial cannot be used.

pygtftk.stats.intersect.overlap_stats_shuffling.compute_all_intersections_minibatch(Lr1, Li1, Lrs, Lis, all_chrom1, all_chrom2, minibatch_size, use_markov_shuffling, keep_intact_in_shuffling, nb_threads, seed=42)

Main processing function. Computes a minibatch of shuffles for the given parameters.

This function will be called by the hub function compute_overlap_stats to create a batch of shuffled “fake” BED files. The Lrs and Li and all_chroms are all outputs from the bed_to_lists_of_intervals() function calls : those are the lists of region lengths, inter-region lengths, and chromosomes for each of the two input files.

Lrs and Lis are lists containing [Lr2] and [Li2] if there is only one query set, but can contain [Lr2, Lr3, …] if there is more than one.

Parameters
  • Lr1 – An output from the bed_to_lists_of_intervals() function calls.

  • Li1 – An output from the bed_to_lists_of_intervals() function calls.

  • Lrs – A list of outputs from the bed_to_lists_of_intervals() function calls.

  • Lis – A list of outputs from the bed_to_lists_of_intervals() function calls.

  • all_chrom1 – An output from the bed_to_lists_of_intervals() function calls.

  • all_chrom2 – An output from the bed_to_lists_of_intervals() function calls.

  • minibatch_size – The size of the batches for shuffling.

  • use_markov_shuffling – Use a classical or an order-2 Markov shuffling.

  • keep_intact_in_shuffling – Among the Lrs (and Lis), those whose number/order is here will be kept intact during the shuffling (won’t be shuffled)

  • nb_threads – number of threads.

pygtftk.stats.intersect.overlap_stats_shuffling.compute_overlap_stats(bedA, bedsB, chrom_len, minibatch_size, minibatch_nb, bed_excl, use_markov_shuffling, keep_intact_in_shuffling, nb_threads, ft_type, multiple_overlap_target_combi_size=None, multiple_overlap_max_number_of_combinations=None, multiple_overlap_custom_combis=None, draw_histogram=False, exact=False)

This is the hub function to compute overlap statistics through Monte Carlo shuffling with integration of the inter-region lengths.

The function will generate shuffled BEDs from bedA and bedB independently, and compute intersections between those shuffles. As such, it gives an estimation of the intersections under the null hypothesis (the sets of regions given in A and B are independent).

Author : Quentin FERRE <quentin.q.ferre@gmail.com>

Parameters
  • bedA – The first bed file in pybedtools.Bedtool format.

  • bedsB – The second bed file. Can also be a list of bed files for multiple overlaps.

  • chrom_len – the dictionary of chromosome lengths

  • minibatch_size – the size of the minibatch for shuffling.

  • minibatch_nb – The number of minibatches.

  • bed_excl – The regions to be excluded.

  • use_markov_shuffling – Use a classical or an order-2 Markov shuffling.

  • keep_intact_in_shuffling – those numbers in bedsB will be kept intact/fixed during the shuffling

  • nb_threads – Number of threads.

  • ft_type – The name of the feature.

  • multiple_overlap_target_combi_size – For multiple overlaps, maximum number of sets in the output combinations.

  • multiple_overlap_max_number_of_combinations – For multiple overlaps, maximum number of combinations to consider. This will use the MOLD mining algorithm. Do not ask for too many.

  • multiple_overlap_custom_combis – For multiple overlaps, skips combination mining and computes stats directly for these combinations. Path to a file to be read as NumPy matrix.

  • draw_histogram – if True, draws a temp file histogram for each combi

  • exact – if True, when performing the statistics, an observation of A+B+C counts as an observation of A+B

The pygtftk.stats.intersect.overlap_stats_compute module

Compute overlap statistics on shuffled sets. Called by overlap_stats_shuffling.compute_overlap_stats(), hence the name.

Those functions tend to take as input lists of shuffles and output statistics.

Author : Quentin Ferré <quentin.q.ferre@gmail.com>

class pygtftk.stats.intersect.overlap_stats_compute.CombinationExactMapping(all_combis, mapping)

A wrapper containing a list of all combinations and a vector giving, for each combination, the ID of all other combinations that are a match for it.

For example, if working with inexact combinations, [1,1,1] is a match when querying [1,1,0]

This is done to save RAM when working with very long combinations.

Example :

>>> all_combis = [(1,1,0),(1,1,1)]
>>> mapping = {(1,1,0):[1]}
>>> cm = CombinationExactMapping(all_combis, mapping)
>>> assert cm.get_all((1,1,0)) == [(1,1,1)]
>>> assert cm.get_all((1,1,1)) == []
class pygtftk.stats.intersect.overlap_stats_compute.Counter(initval=0)

Multiprocessing-compatible counter that can be incremented.

class pygtftk.stats.intersect.overlap_stats_compute.DictionaryWithIndex(data, index, data_default_factory=<function DictionaryWithIndex.<lambda>>, will_store_an_all_overlaps_object=False)

A wrapper allowing to query a dictionary so that an item can also return values from several other items. The dictionary will return concatenated values from several keys when asked for.

It will return values from itself PLUS everything in the index

Specifically designed for lists of overlaps in OLOGRAM-MODL, meaning the final result will be a list with the same number of elements, whose i-th element for i in 1..n will be the concatenated i-th elements of all candidate lists

Example :

>>> all_combis = ['A','B','C']
>>> mapping = {'A':[1,2]}
>>> index = CombinationExactMapping(all_combis, mapping)
>>> d = { 'A':[[1,1],[1],[1,1]], 'B':[[2],[2,2],[2]], 'C':[[3,3],[3],[3]] }
>>> di = DictionaryWithIndex(d, index)
>>> assert di.get_all('A') == [[1,1,2,3,3], [1,2,2,3], [1,1,2,3]]
>>> assert di.get_all('B') == [[2],[2,2],[2]]
get_simple_concatenation(key)

For the true overlaps. Do not merge individual shuffles, there are no shuffles to be merged. Simply concatenate the lists.

class pygtftk.stats.intersect.overlap_stats_compute.HashableArray(input_array)

A subclass of NumPy’s ndarray that can be used as dictionary key. A dictionary of such arrays will consume less RAM than a tuple, especially when pickled.

This is however immutable, and should not be used on arrays that you intend to modify.

NOTE It is hashed at creation, so creation can take a bit of time. Use a tuple if that is unacceptable.

class pygtftk.stats.intersect.overlap_stats_compute.SparseListOfLists(starting_index=0)
Container for a list of this type :
[

[elements], [], [], [], [other_elements], [], [], …

]

Meaning, a list of lists where many of the lists inside will be empty. This helps save RAM.

Example:

>>> l = SparseListOfLists()
>>> l.put([])
>>> l.put(['Hello'])
>>> l.put([])
>>> assert [i for i in l] == [[],['Hello'],[]]
>>> assert l[0] == []
>>> assert l[1] == ['Hello']
>>> l[0] = ['Ha']
>>> l[1] = ['Ho']
>>> assert [i for i in l] == [['Ha'],['Ho'],[]]
>>> assert list(l) == [i for i in l]
put(element)

Add an element at the tail of the SparseListOfLists

pygtftk.stats.intersect.overlap_stats_compute.compute_stats_for_intersection(myintersect)

Wrapper to compute all stats we want on a single intersect result object. The argument (myintersect) is a single bedfile, either as a pybedtools intersect result, or a list of tuples/lists like [[‘chr1’,10,20],[‘chr2’,30,40]].

pygtftk.stats.intersect.overlap_stats_compute.compute_true_intersection(bedA, bedsB)

Returns the custom-computed true intersection between bedA and all in bedsB combined, where bedA is a pybedtools.BedTool object and bedsB is a list of such objects.

Returns also the intersection flags.

pygtftk.stats.intersect.overlap_stats_compute.get_index_if_present(mylist, x)

Locate the leftmost value exactly equal to x in a sorted list, otherwise returns None

Example:

>>> L = [1,2,5,6,8]
>>> assert get_index_if_present(L, 2) == 1
>>> assert get_index_if_present(L, 4) == None
pygtftk.stats.intersect.overlap_stats_compute.get_items_by_indices_in_list(indices, mylist)

Quick way to perform multiple indexing on a list. Unlike the operator.itemgetter method, will always return a list

Example: >>> assert get_items_by_indices_in_list(indices = [0], mylist = [1,2,3]) == [1]

pygtftk.stats.intersect.overlap_stats_compute.index_all_these(combis_to_index, exact)

Helper function to run the global numpy analogue of which_combis_to_get_from on a list of combis.

This is not a pure function and relies on several external parameters that will be named as globals later, before this function is called. Those parameters are stored in the “dummy” module.

NOTE : the oc.NPARRAY_which_combis_match_with already integrates a check and will not return the same index as the query

pygtftk.stats.intersect.overlap_stats_compute.merge_consecutive_intersections_in_all_overlaps_lists(aiqc)

Merges the consecutive intersections in a list-of-intersection-batches object.

This is an in-place operation: it will not return a new list, but change the original !

Example:

>>> all_intersections_queried_for_this_combi = [ [(b'chr1',1,100),(b'chr1',100,200),(b'chr1',200,300)], [(b'chr1',100,200),(b'chr1',200,300),(b'chr1',600,700)], [(b'chr1',100,200),(b'chr2',200,300)] ]
>>> expected = [ [(b'chr1',1,300)], [(b'chr1',100,300),(b'chr1',600,700)], [(b'chr1',100,200),(b'chr2',200,300)] ]
>>> merge_consecutive_intersections_in_all_overlaps_lists(all_intersections_queried_for_this_combi)
>>> assert all_intersections_queried_for_this_combi == expected
pygtftk.stats.intersect.overlap_stats_compute.merge_consecutive_intersections_in_intersections_list(inter_list)

Merges the consecutive intersections in a list-of-intersections object.

Note that this will discard the flags (4th element, np array) and any other element in the list of lists.

This is NOT an in-place operation, and returns the new list.

pygtftk.stats.intersect.overlap_stats_compute.stats_multiple_overlap(all_overlaps, bedA, bedsB, all_feature_labels, nb_threads=8, nofit=False, multiple_overlap_target_combi_size=None, multiple_overlap_max_number_of_combinations=None, multiple_overlap_custom_combis=None, draw_histogram=False, exact=False)

Instead of returning one set of overlap stats per query type (ie. exons, genes, bedfile1, bedfile2, etc…) it will return one per multiple overlap (ie. here there was a peak for bedfile1+gene, or bedfile1+bedfile2+gene, etc.)

Only do so for “common/interesting combis”, found by dict learning on original data.

Parameters

all_overlaps – The list of all overlaps computed for all shuffles

pygtftk.stats.intersect.overlap_stats_compute.stats_single(all_intersections_for_this_combi, true_intersection, ft_type='some feature', nofit=False, this_combi_only=None, draw_histogram=False)

Compute statistics such as total number of overlapping base pairs for a given feature.

Parameters
  • all_intersections_for_this_combi – an object returned by compute_all_intersections_minibatch giving all intersections between the shuffled bedA and bedsB.

  • true_intersection – the result of overlap_stats_compute.compute_true_intersection(bedA, bedsB) where bedA is the query and bedsB is the list of the bed files between whose shuffles the aforementioned intersections have been computed. Used here to calculate the true intersections between them and calculate a Neg Binom p-value.

  • ft_type – for debug messages, which feature/combi are we currently processing ?

  • nofit – if True, does not do Negative Binomial fitting

  • this_combi_only – a list of flags (e.g. [1,0,0,1]) corresponding to expected flags in the intersections, one per file (see find_intersection() source and documentation). If not None, we will consider only intersections that have this flag for the number of true intersections and true overlapping basepairs

  • draw_histogram – if True, draws a temp file histogram for each combi

pygtftk.stats.intersect.overlap_stats_compute.which_combis_to_get_from(combi, all_possible_combis, exact)

To build an index later, determine the supplementary combis : meaning, for each combination, which other combinations needs to be queried.

Relevant if we desire inexact or exact combis; Reminded : a combi A is a “inexact” match for combi B if all flags of combi B are also present in combi A, along with potentially others.

Returns an index giving the supplementary combinations.

Example :

>>> all_combis = [(1,1,0),(1,1,1),(0,0,1)]
>>> c,r = which_combis_to_get_from((1,1,0), all_combis, exact = False)
>>> assert r.tolist() == [1]

The pygtftk.stats.negbin_fit module

Contains various utility functions relative to the negative binomial distribution.

pygtftk.stats.negbin_fit.check_negbin_adjustment(obs, mean, var, bins_number=16)

Considers a Negative Binomial distribution of given mean and var, and performs an adjustment test of this distribution with regards to obs.

Returns the result of a Cramer’s V adjustment test.

A classical Chi-squared test cannot be used, since in a typical use case sample size will be only 250 (shuffles) and as such expected frequency for each value will be well under 5. As such, we perform the Chi-Squared test on an equivalent “histogram” : instead of computing the expected number of variates exactly equal to, say, 22, we compute how many variates fall in the 20-30 range. Target bin number is adjustable; final bin number may vary due to rounding.

Furthermore, the Chi-Squared test can artificially return (H0 false) if n is too large. Hence, we use Cramer’s V test instead.

The function returns 1 - V, where V is the Cramer’s V test of a crosstab made by concatenating the observed counts and expected counts vectors.

The V test is here used to determine whether there is an association between counts distribution and the status as ‘expected’ or ‘observed’. As per Cramer (1948) a good fit should have a fit quality above 1 - 0.25 = 0.75 because V > 0.25 would mean association in most cases. Typically, you will observe good association when len(obs) is in the thousands. When it is in the hundreds only, random chance may result in a okay-ish association (around 0.8). Conversely, if len(obs) is in the thousands, only fits above 0.9 should be considered good.

Parameters
  • obs – Observed values.

  • mean – The mean for the negative binomial model.

  • var – The variance for the negative binomial model.

  • bins_number – The number of bins in histogram.

Example

>>> import numpy as np
>>> import scipy.stats
>>> from pygtftk.stats.negbin_fit import check_negbin_adjustment
>>> np.random.seed(42)
>>> mean = 100 ; var = 200
>>> r = mean**2 / (var-mean) ; p = 1/(mean/r + 1)
>>> rv = scipy.stats.nbinom(r, p)
>>> obs = rv.rvs(2000)
>>> fit = check_negbin_adjustment(obs, mean, var)
>>> fit = np.around(fit, 2)
>>> assert fit > 0.95
pygtftk.stats.negbin_fit.empirical_p_val(x, data)

Quick wrapper : empirical two-sided p value.

Returns the proportion of elements greater than x (or smaller than x) in the data (resp. whichever proportion is smaller). This can be used with any dataset, not just a negative-binomial-compliant one.

pygtftk.stats.negbin_fit.negbin_pval(k, mean, var, precision=320, ft_type='Unknown')

P-value for a negative binomial distribution of the given moments (mean, var).

This is the two-sided p-value : it will return the minimum of the left-sided and right-sided p-value

NOTE : To prevent division by zero or negative r, if the mean is higher than or equal to the variance, set the variance to mean + epsilon and send a warning

Parameters
  • k – the critical value for which the pvalue is computed.

  • mean – The mean for the negative binomial model.

  • var – The variance for the negative binomial model.

  • precision – Floating point precision of mpmath. Should be at least 1000

  • ft_type – The name of the feature to be tested (just for meaningful messages).

>>> from pygtftk.stats.negbin_fit import negbin_pval
>>> mean = 18400
>>> var = 630200
>>> k = 65630
>>> pval = negbin_pval(k, mean, var)
>>> import math
>>> assert(math.isclose(pval,1.1999432787236828e-307))

The pygtftk.stats.beta module

class pygtftk.stats.beta.BetaCalculator(precision=320, use_log=True, itermax=50000, ft_type='Unknown')

Computing the beta distribution in Python using mpmath.

Using routines from the GNU Scientific Library (beta_inc.c): <Copyright (C) 2007 Brian Gough and Copyright (C) 1996, 1997, 1998, 1999, 2000 Gerard Jungman>

Available under the GNU GPL v3 license.

Formulas :

self.betainc(A, B, p) <–> mpmath.betainc(A, B, 0, p) <–> mpmath.quad(lambda t: t**(A-1)*(1-t)**(B-1), [0, p])

Example

>>> from pygtftk.stats.beta import BetaCalculator
>>> import mpmath
>>> mycalc = BetaCalculator()
>>> res_A = float(mpmath.beta(5,2))
>>> res_B = float(mycalc.beta(5,2))
>>> assert(res_A == res_B)
>>> res_A = float(mpmath.betainc(a=5,b=2,x2=0.5))
>>> res_B = float(mycalc.betainc(a=5,b=2,x=0.5))
>>> assert(res_A == res_B)
beta(a, b)

Uses either mpmath’s gamma or log-gamma function to compute values of the beta function.

betainc(a, b, x)

Non-regularized incomplete beta. See betaincreg().

betaincreg(a, b, x)

betaincreg(a,b,x) evaluates the incomplete beta function (regularized). It requires a, b > 0 and 0 <= x <= 1.

Code translated from: GNU Scientific Library

contfractbeta(a, b, x)

Evaluates the continued fraction form of the incomplete Beta function.

Code translated from: GNU Scientific Library

Uses the modified Lentz’s method.

You can see a representation of this form in the Digital Library of Mathematical Functions <https://dlmf.nist.gov/8.17#SS5.p1>. The goal of the method is to calculate the successive ‘d’ terms, separately for odd and even.

pygtftk.stats.beta.beta_pval(k, obs, precision=320)

P-value for the given critical value against a beta distribution fitted to the given list of observations.

This is the two-sided p-value : it will return the minimum of the left-sided and right-sided p-value

Parameters
  • k – the critical value whose p-value will be calculated

  • obs – the list of observations to which the beta distribution will be fitted

  • precision – Floating point precision of mpmath. Should be at least 1000

>>> from pygtftk.stats.beta import beta_pval
>>> from scipy.stats import beta
>>> import numpy.testing as npt
>>> import numpy as np
>>> a, b = 1., 2.
>>> np.random.seed(seed=42)
>>> obs = beta.rvs(1, 2, size=10000)
>>> k = 0.6
>>> p = 1 - beta.cdf(k,a,b)
>>> phat = beta_pval(k, obs)                        # Test the combined package
>>> x = 0.9999
>>> cp = beta.cdf(x,a,b)
>>> mybetacalc = BetaCalculator()
>>> cphat = mybetacalc.betaincreg(a=a, b=b, x=x)    # Test just the p-value
>>> npt.assert_allclose(p, phat, rtol=0.1)          # Beta approximation is too imprecise for extreme p-values, however.
>>> npt.assert_allclose(float(cp), float(cphat), rtol=0.05)
pygtftk.stats.beta.fit_beta(obs)

Fits a four-parameter Beta distribution to the given list of observations using method-of-moments fitting.

>>> from scipy.stats import beta
>>> import numpy.testing as npt
>>> import numpy as np
>>> from pygtftk.stats.beta import fit_beta
>>> a, b = 1., 2.
>>> np.random.seed(seed=42)
>>> obs = beta.rvs(a, b, size=10000)  # You need at least 10K for a good estimate (!)
>>> ahat, bhat, mhat, chat = fit_beta(obs)
>>> npt.assert_allclose((ahat, bhat), (a,b), rtol = 0.05)

The pygtftk.stats.intersect.modl.tree module

A tree (or more accurately, graph) based representation of combinations of elements.

Combinations are represented as tuples. For example, if the possible elements are {A,B,C,D}, the combination A+C is represented as (1,0,1,0).

Nodes will be assigned under the following principle : each node X is a child of every node Y whose flags are all contained in X. For example, (1,0,0) is a parent of (1,0,1) but not the other way around.

A Library is a set of nodes starting with a root of (0,0,0,…)

Author : Quentin Ferré <quentin.q.ferre@gmail.com>

class pygtftk.stats.intersect.modl.tree.Library

A tree (or more accurately, graph) based representation of combinations of elements.

Combinations are represented as tuples. For example, if the possible elements are {A,B,C,D}, the combination A+C is represented as (1,0,1,0).

Nodes will be assigned under the following principle : each node X is a child of every node Y whose flags are all contained in X. For example, (1,0,0) is a parent of (1,0,1) but not the other way around.

A Library is a set of nodes starting with a root of (0,0,0,…)

Example of use :

>>> from pygtftk.stats.intersect.modl.tree import Library
>>> words = [(1,1,0),(1,0,0),(0,1,0)]
>>> l = Library()
>>> l.build_nodes_for_words(words)
>>> l.assign_nodes()
build_nodes_for_words(words)

Builds unassigned nodes for all words.

words must be a list of tuples like [(1,0,0),(1,1,0)]

build_nodes_for_words_from_ologram_result_df(result_df, query_name='Query')

Used for graphical display later on

The nodes should also have a pval and fc and s that can be given and that we can query later on. So the build_nodes_for_words function can also take a pandas dataframe that is an OLOGRAM output !!

class pygtftk.stats.intersect.modl.tree.Node(word)

Represents a node of the tree. Each node is a word. A word must be a tuple such as (1,1,0,0) meaning A+B if the possibilities are {A,B,C,D}

pygtftk.stats.intersect.modl.tree.apply_recursively_to_all_nodes(node, function, global_results, no_duplicates=True, stop_condition=<function <lambda>>)

General utility function to recursively apply a function to all nodes of a tree. Also pass a global_results dict to be added to.

Since nodes can have several parents, by default this will remember which nodes have been seen and not operate on them twice (no_duplicates).

A stop condition function of signature stop_condition(current_node, global_results) can be passed : the recursion will stop if it returns True at any point.

>>> from pygtftk.stats.intersect.modl.tree import Library, apply_recursively_to_all_nodes
>>> from pygtftk import utils
>>> utils.VERBOSITY = 0
>>> words = [(1,0,0), (1,1,0), (0,1,0), (1,1,1)]
>>> l = Library()
>>> l.build_nodes_for_words(words)
>>> l.assign_nodes()
>>> manual_print_words = set([str(n) for n in l.assigned_nodes])
>>> gr = dict()
>>> apply_recursively_to_all_nodes(l.root_node, str, gr)
>>> recursive_print_words = set(gr.values())
>>> assert recursive_print_words == manual_print_words
pygtftk.stats.intersect.modl.tree.get_all_candidates_except(library, exclude)

Returns the words of all nodes, except those words that are in the exclusion list

pygtftk.stats.intersect.modl.tree.output_visualize(tree, output_path, features_names=None)

Output a visualisation of a given Library (the tree argument)

Parameters
  • tree – : The library to write

  • output_path – Path to write to

  • features_names – Conversion key giving the name of each feature in the vector. For example if features_names = [‘A’,’B’,’C’], (0,1,1) will be translated as B+C

The pygtftk.stats.intersect.modl.dict_learning module

This module contains the MODL algorithm, an itemset mining algorithm that broadly consists of two steps.

Considering an input matrix with transactions/regions/intersections in lines and elements/items/sets in columns, the algorithm will : (I) learn (MiniBatchDictionaryLearning) several factorisations with different sparsity constraints to build a library of candidate words (eg. (1,1,0,0,1)) and (II) select the best words to rebuild the original matrix using a greedy algorithm, by trying to rebuild the original matrix with a subset of selected words, adding the word that best improves the rebuilding to said subset each time.

Dictionary learning is an optimization problem solved by alternatively updating the sparse code, as a solution to multiple Lasso problems, considering the dictionary fixed, and then updating the dictionary to best fit the sparse code.

In OLOGRAM, we use this on the matrix of true overlap flags to find usual common overlaps.

This is mostly useful if there are many files to reduce the number of displayed combinations, and to counteract the effect of noise on the data. Unlike classical association rules mining algorithms, this focuses on mining complexes and correlation groups (item sets).

Author : Quentin FERRE <quentin.q.ferre@gmail.com>

This code is available under the GNU GPL license.

class pygtftk.stats.intersect.modl.dict_learning.Modl(flags_matrix, multiple_overlap_target_combi_size=- 1, multiple_overlap_max_number_of_combinations=5, nb_threads=1, step_1_factor_allowance=2, error_function=None, smother=True, normalize_words=True, step_2_alpha=None, discretization_threshold=0, step_1_alphas=None)

This class encapsulates the MODL approach :

Takes as input a matrix of flags, with one flag per intersection, and returns the list of interesting combis.

For more details, please see the module description of pygtftk/stats/intersect/modl/dict_learning.py

Parameters
  • flags_matrix – Matrix of overlaps

  • multiple_overlap_target_combi_size – Candidates longer than this will be discarded

  • multiple_overlap_max_number_of_combinations – Final desired number of combinations

  • nb_threads – Number of threads

  • step_1_factor_allowance – In step 1 of building the candidates, how many words are allowed in the Dictionary Learning as a proportion of multiple_overlap_max_number_of_combinations

  • error_function – error function used in step 2. Default to manhattan error.

  • smother – Should the smothering which reduces each row’s abundance to its square root to emphasize rarer combinations be applied ? Default is True

  • normalize_words – Normalize the words by their summed squares in step 2. Default True.

  • step_2_alpha – Override the alpha used in step 2.

  • discretization_threshold – discretization_threshold in step 1. In each atom, elements below D*maximum_for_this_atom will be discarded. Optional.

  • step_1_alphas – A list to manually override the alphas to be used during step 1. Optional.

Passing a custom error function, it must have the signature error_function(X_true, X_rebuilt, encoded, dictionary). X_true is the real data, X_rebuilt is the reconstruction to evaluate, code is the encoded version (in our case used to check sparsity), dictionary has one learned atom per row.

>>> from pygtftk.stats.intersect.modl.dict_learning import Modl, test_data_for_modl
>>> import numpy as np
>>> np.random.seed(42)
>>> flags_matrix = test_data_for_modl(nflags = 1000, number_of_sets = 6, noise = 0.05, cor_groups = [(0,1),(0,1,2,3),(4,5)])
>>> combi_miner = Modl(flags_matrix, multiple_overlap_max_number_of_combinations = 3)
>>> interesting_combis = combi_miner.find_interesting_combinations()
>>> assert set(interesting_combis) == set([(1,1,0,0,0,0),(1,1,1,1,0,0),(0,0,0,0,1,1)])
find_interesting_combinations()

MAIN FUNCTION. Will call the others.

generate_candidate_words()

Generate the library by running dictionary-learning based matrix factorizations on the data matrix

select_best_words_from_library()

This is step 2. Takes the library of candidates produced at step 1 and will get the best N words among it that best rebuild the original matrix.

pygtftk.stats.intersect.modl.dict_learning.squish_matrix(x, abundance_threshold=0, shuffle=True, smother=True)

To reduce redundancy in the matrix lines, take all unique rows of X and build a squished matrix where each line now has the square root of its original abundance divided by sqrt(abundance of the rarest row), but not lower than abundance_threshold, 1/10000 by default. We use the square root of those abundances instead to diminish the emphasis on the most frequent combinations by default (smothering).

>>> import numpy as np
>>> from pygtftk.stats.intersect.modl.dict_learning import squish_matrix
>>> X = np.array([[1,1,0,0]]*1000 + [[0,0,1,1]]*100)
>>> X_squished = squish_matrix(X, shuffle = False, smother = True)
>>> np.testing.assert_equal(X_squished, np.array([[0,0,1,1]]*1 + [[1,1,0,0]]*4)) # Note that the rows have been sorted by abundance   
pygtftk.stats.intersect.modl.dict_learning.test_data_for_modl(nflags=1000, number_of_sets=6, noise=0, cor_groups=[(0, 1), (0, 1, 2, 3), (4, 5)])

Generate testing data for the dictionary learning module.

The pygtftk.stats.intersect.modl.subroutines module

Pure subroutines of the MODL algorithm.

Author : Quentin Ferré <quentin.q.ferre@gmail.com>

pygtftk.stats.intersect.modl.subroutines.build_best_dict_from_library(data, library, queried_words_nb, error_function=None, nb_threads=1, normalize_words=False, transform_alpha=None)

Given a data matrix and a library, will select the best n = queried_words_nb words with a greedy algorithm from the library to rebuild the data.

data is a matrix with one transaction per row and one element per column, as usual

The greedy algorithm works because the problem in this particular instance is submodular. In broad strokes, it is assumed that the gain in terms of reconstruction with the Sparse Coder of adding the word X to a set S is no higher than adding X to a set T that contains S. Also, more words cannot make the sparse coder do a worse job so the function is monotonous.

Instead of the reconstruction error, you may pass a different callable of the form error_function(data, rebuilt_data, encoded, dictionary) that returns an error value so there can be a supervision, but the submodularity might no longer hold.

>>> import numpy as np
>>> from pygtftk.stats.intersect.modl import tree
>>> from pygtftk.stats.intersect.modl import subroutines as modl_subroutines
>>> X = np.array([[1,1,0,0,0,0],[1,1,1,1,0,0],[0,0,0,0,1,1]]*10)
>>> candidates = [(1,1,0,0,0,0),(1,1,1,1,0,0),(0,0,0,0,1,1), (1,1,0,0,1,1), (1,0,1,0,1,0),(1,1,0,1,1,0),(1,0,1,1,0,0),(0,1,0,0,1,1)]
>>> L = tree.Library()
>>> L.build_nodes_for_words(candidates)
>>> L.assign_nodes()
>>> selection = modl_subroutines.build_best_dict_from_library(X, L, queried_words_nb = 3)
>>> assert set(selection) == set([(1,1,0,0,0,0),(1,1,1,1,0,0),(0,0,0,0,1,1)])
pygtftk.stats.intersect.modl.subroutines.generate_candidate_words(X, n_words, nb_threads=1, discretization_threshold=0, alphas=None)

Given a data matrix, will generate a library of candidate words by trying many different Dictionary Learnings (different alphas only for now)

pygtftk.stats.intersect.modl.subroutines.learn_dictionary_and_encode(data, n_atoms=20, alpha=0.5, n_iter=200, random_seed=42, n_jobs=1, fit_algorithm='cd', transform_algorithm='lasso_cd')

Will learn a dictionary for the data (row-wise) and encode it, with the specified parameters. Returns the dictionary, components as a dataframe, and encoded data.

By default, allows 20 words with alpha of 0.5.

More info at : https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.MiniBatchDictionaryLearning.html

>>> from pygtftk.stats.intersect.modl.dict_learning import test_data_for_modl
>>> from pygtftk.stats.intersect.modl.subroutines import learn_dictionary_and_encode
>>> import numpy as np
>>> np.random.seed(42)
>>> flags_matrix = test_data_for_modl(nflags = 1000, number_of_sets = 6)
>>> import time
>>> start_time = time.time()
>>> U_df, V_df, error = learn_dictionary_and_encode(flags_matrix, n_atoms = 20, alpha = 0.5)
>>> stop_time = time.time()
pygtftk.stats.intersect.modl.subroutines.normalize_and_jitter_matrix_rows(X, normalize=True, jitter=True)

Apply normalization so that the sum of squares of each row is 1. Add a slight epsilon so two rows may not have the exact same dot product with a given vector. Sort in lexicographic order.

>>> import numpy as np
>>> import numpy.testing as npt
>>> from pygtftk.stats.intersect.modl.subroutines import normalize_and_jitter_matrix_rows
>>> D = np.array([[1,1,0],[0,1,0],[0,1,1]])
>>> Dc = normalize_and_jitter_matrix_rows(D)
>>> Dc_theory = [[0, 1, 0], [7.0736E-5, 7.07107E-1, 7.07107E-1], [7.07107E-1, 7.07107E-1, 9.99859E-05]]
>>> npt.assert_almost_equal(Dc, Dc_theory, decimal = 5)

The gtftk.utils module

A set of useful functions.

exception pygtftk.utils.GTFtkError(value)
exception pygtftk.utils.GTFtkInteractiveError
pygtftk.utils.add_prefix_to_file(infile, prefix=None)

Returns a file object or path (as string) in which the base name is prefixed with ‘prefix’. Returns a file object (write mode) or path depending on input type.

Parameters
  • infile – A file object or path.

  • prefix – the prefix to add.

Example

>>> from pygtftk.utils import add_prefix_to_file
>>> result = 'data/simple/bla_simple.gtf'
>>> assert add_prefix_to_file('data/simple/simple.gtf', "bla_") == result
pygtftk.utils.check_boolean_exprs(exprs=None, operand=(), send_error=True)

Check whether a boolean expression is properly formed.

Parameters
  • exprs – The string to evaluate.

  • operand – The name of the operands.

  • send_error – Whether to throw an error if expression is malformed.

Returns

A boolean.

Example

>>> from pygtftk.utils import check_boolean_exprs
>>> assert check_boolean_exprs('s > 1 and (s < 2 or y < 2.5)', operand=['s', 'y'])
pygtftk.utils.check_file_or_dir_exists(file_or_dir=None)

Check if a file/directory or a list of files/directories exist. Raise error if a file is not found.

Parameters

file_or_dir – file object or a list of file object.

Example

>>> from pygtftk.utils import get_example_file
>>> from pygtftk.utils import check_file_or_dir_exists
>>> assert check_file_or_dir_exists(get_example_file()[0])
>>> assert check_file_or_dir_exists(get_example_file())
pygtftk.utils.check_r_installed()

Check R is installed.

Example

>>> from pygtftk.utils import check_r_installed
>>> # check_r_installed()
pygtftk.utils.check_r_packages(r_pkg_list=None, no_error=True)

Return True if R packages are installed. Return False otherwise.

Parameters
  • no_error – don’t raise an error if some packages are not found.

  • r_pkg_list – the list of R packages.

Example

>>> from pygtftk.utils import check_r_packages
pygtftk.utils.chomp(string)

Delete carriage return and line feed from end of string.

Example

>>> from pygtftk.utils import chomp
>>> from pygtftk.utils import NEWLINE
>>> assert NEWLINE not in chomp("blabla\r\n")
pygtftk.utils.chrom_info_as_dict(chrom_info_file)

Check the format of the chromosome info file and return a dict.

Parameters

chrom_info_file – A file object.

Example

>>> from pygtftk.utils import get_example_file
>>> from pygtftk.utils import chrom_info_as_dict
>>> a = get_example_file(ext='chromInfo')[0]
>>> b = chrom_info_as_dict(open(a, "r"))
>>> assert b['chr1'] == 300
>>> assert b['all_chrom'] == 900
pygtftk.utils.chrom_info_to_bed_file(chrom_file, chr_list=None)

Return a bed file (file object) from a chrom info file.

Parameters
  • chrom_file – A file object.

  • chr_list – A list of chromosome to be printed to bed file.

>>> from  pygtftk.utils import chrom_info_to_bed_file
>>> from  pygtftk.utils import get_example_file
>>> from pybedtools import  BedTool
>>> a = get_example_file(ext='chromInfo')
>>> b = chrom_info_to_bed_file(open(a[0], 'r'))
>>> c = BedTool(b.name)
>>> chrs = []
>>> for i in c: chrs += [i.chrom]
>>> assert 'chr1' in chrs and 'chr2' in chrs
>>> starts = []
>>> for i in c: starts += [i.start]
>>> assert 0 in starts
>>> ends = []
>>> for i in c: ends += [i.end]
>>> assert 300 in ends and 600 in ends
pygtftk.utils.close_properly(*args)

Close a set of file if they are not None.

Example

>>> from pygtftk.utils import close_properly
pygtftk.utils.flatten_list(x, outlist=None)

Flatten a list of lists.

Parameters
  • x – a list or list of list.

  • outlist – The output list.

Example

>>> from pygtftk.utils import flatten_list
>>> b = ["A", "B", "C"]
>>> a = ["a", "b", "c"]
>>> c = ['a', 'b', 'c', 'A', 'B', 'C', 'string']
>>> # Call it with an empty list (outlist)
>>> # otherwise, element will be added to the existing
>>> # outlist variable
>>> assert flatten_list([a,b, "string"], outlist=[]) ==  c
>>> assert flatten_list("string", outlist=[]) == ['string']
pygtftk.utils.flatten_list_recur(a_list, sep=' ')

Join a list of list. :param a_list: a list. :param sep: the separator.

pygtftk.utils.get_example_feature()

Returns an example Feature (i.e GTF line).

Example

>>> from pygtftk.utils import get_example_feature
>>> a = get_example_feature()
pygtftk.utils.get_example_file(datasetname='simple', ext='gtf')

Get the path to a gtf file example located in pygtftk library path.

Parameters
  • datasetname – The name of the dataset. One of ‘simple’, ‘mini_real’, ‘simple_02’, ‘simple_03’.

  • ext – Extension. For ‘simple’ dataset, can be one of ‘bw’, ‘fa’, ‘fa.fai’, ‘chromInfo’, ‘bt*’, ‘fq’, ‘gtf’ or ‘.*’.

Example

>>> from pygtftk.utils import get_example_file
>>> a= get_example_file()
>>> assert a[0].endswith('gtf')
>>> a= get_example_file(ext="bam")
>>> assert a[0].endswith('bam')
>>> a= get_example_file(ext="bw")
>>> assert a[0].endswith('bw')
pygtftk.utils.head_file(afile=None, nb_line=6)

Display the first lines of a file (debug).

Parameters
  • afile – an input file.

  • nb_line – the number of lines to print.

Example

>>> from pygtftk.utils import get_example_file
>>> from pygtftk.utils import head_file
>>> my_file = open(get_example_file()[0], "r")
>>> #head_file(my_file)
pygtftk.utils.intervals(l, n, silent=False)

Returns a list of tuple that correspond to n ~ equally spaced intervals between 0 and l. Note that n must be lower than len(l).

Parameters
  • silent – use a silent mode.

  • l – a range

  • n – number of equally spaced intervals.

Example

>>> from pygtftk.utils import  intervals
>>> l = range(10)
>>> a = intervals(l, 3)
>>> assert a == [(0, 3), (3, 6), (6, 10)]
pygtftk.utils.is_comment(string)

Check whether a string is a comment (starts with #…).

Parameters

string – The string to be tested.

Example

>>> from pygtftk.utils import is_comment
>>> assert is_comment("#bla")
pygtftk.utils.is_empty(string)

Check whether the string is empty.

Parameters

string – The string to be tested.

Example

>>> from pygtftk.utils import is_empty
>>> assert is_empty("")
pygtftk.utils.is_exon(string)

Check whether the string contains ‘exon’, preceded or not by spaces.

Parameters

string – The string to be tested.

Example

>>> from pygtftk.utils import is_exon
>>> assert is_exon("Exon") and is_exon("exon")
pygtftk.utils.is_fasta_header(string)

Check if the line is a fasta header.

Parameters

string – a character string.

Example

>>> from pygtftk.utils import is_fasta_header
>>> assert is_fasta_header(">DFTDFTD")
pygtftk.utils.make_outdir_and_file(out_dir=None, alist=None, force=False)

Create output directory and a set of associated files (alist). Return a list of file handler (write mode) for the requested files.

Parameters
  • out_dir – the directory where file will be located.

  • alist – the list of requested file names.

  • force – if true, don’t abort if directory already exists.

Example

>>> from pygtftk.utils import make_outdir_and_file
pygtftk.utils.make_tmp_dir(prefix='tmp', store=True)

This function should be called to create a temporary directory; all files declared in TMP_FILE_LIST will be removed upon command exit.

Parameters
  • prefix – a prefix for the temporary directory (all of them will contain ‘pygtftk’).

  • store – declare the temporary directory in utils.TMP_FILE_LIST. In pygtftk, the deletion of these files upon exit is controlled through -k.

Example

>>> from pygtftk.utils import  make_tmp_dir
>>> import os
>>> import shutil
>>> tp_dir = make_tmp_dir()
>>> assert os.path.exists(tp_dir)
>>> assert os.path.isdir(tp_dir)
>>> shutil.rmtree(tp_dir)
pygtftk.utils.make_tmp_file(prefix='tmp', suffix='', store=True, dir=None)

This function should be called to create a temporary file; all files declared in TMP_FILE_LIST will be removed upon command exit.

Parameters
  • prefix – a prefix for the temporary file (all of them will contain ‘pygtftk’).

  • suffix – a suffix (default to ‘’).

  • store – declare the temporary file in utils.TMP_FILE_LIST. In pygtftk, the deletion of these files upon exit is controlled through -k.

  • dir – a target directory.

Example

>>> from pygtftk.utils import make_tmp_file
>>> tmp_file = make_tmp_file()
>>> assert os.path.exists(tmp_file.name)
>>> tmp_file = make_tmp_file(prefix="pref")
>>> assert os.path.exists(tmp_file.name)
>>> tmp_file = make_tmp_file(suffix="suf")
>>> assert os.path.exists(tmp_file.name)
pygtftk.utils.median_comp(alist)

Compute the median from a list.

Parameters

alist – a list.

Example

>>> from pygtftk.utils import median_comp
>>> a = [10,20,30,40,50]
>>> assert median_comp(a) == 30
>>> a = [10,20,40,50]
>>> assert median_comp(a) == 30
pygtftk.utils.message(msg, nl=True, type='INFO', force=False)

Send a formatted message on STDERR.

Parameters
  • msg – the message to be sent.

  • nl – if True, add a newline.

  • type – Could be INFO, ERROR, WARNING.

  • force – Force message, whatever the verbosity level.

>>> from pygtftk.utils import message
pygtftk.utils.mkdir_p(path)

Create a directory, silently doing nothing if it already exists.

Example

>>> from pygtftk.utils import check_file_or_dir_exists
>>> from pygtftk.utils import mkdir_p
>>> mkdir_p('/tmp/test_gtftk_mkdir_p')
>>> assert check_file_or_dir_exists('/tmp/test_gtftk_mkdir_p')
pygtftk.utils.nested_dict(n, type)

http://stackoverflow.com/questions/29348345

pygtftk.utils.random_string(n)

Returns a random string (alpha numeric) of length n.

pygtftk.utils.rnd_alpha_numeric_string(str_len=8)

Returns a random alphanumeric string of size str_len.

Parameters

str_len – The length of the output string.

Example

>>> from pygtftk.utils import rnd_alpha_numeric_string
>>> assert len(rnd_alpha_numeric_string(str_len=8)) == 8
pygtftk.utils.silentremove(filename)

Remove silently (without error and warning) a file.

Parameters

filename – a file name (or file object).

>>> from pygtftk.utils import silentremove
>>> from pygtftk.utils import make_tmp_file
>>> a = make_tmp_file()
>>> assert os.path.exists(a.name)
>>> silentremove(a.name)
>>> assert not os.path.exists(a.name)
pygtftk.utils.simple_line_count(afile)

Count the number of lines in a file.

Parameters

afile – A file object.

Example

>>> from pygtftk.utils import get_example_file
>>> from pygtftk.utils import simple_line_count
>>> my_file = get_example_file(datasetname="simple", ext="gtf")
>>> my_file_h = open(my_file[0], "r")
>>> assert simple_line_count(my_file_h) == 70
pygtftk.utils.simple_nb_column(afile, separator='\t')

Count the maximum number of columns in a file.

Parameters
  • separator – the separator (default: tab, ‘\t’).

  • afile – file name.

Example

>>> from pygtftk.utils import get_example_file
>>> from pygtftk.utils import simple_nb_column
>>> my_file = get_example_file(datasetname="simple", ext="gtf")
>>> my_file_h = open(my_file[0], "r")
>>> assert simple_nb_column(my_file_h) == 9
pygtftk.utils.sort_2_lists(list1, list2)

Sort list1 in increasing order and list2 accordingly.

Parameters
  • list1 – the first list.

  • list2 – the second list.

Example

>>> from pygtftk.utils import sort_2_lists
>>> a = ["c", "a", "b"]
>>> b = ["A", "B", "C"]
>>> c = sort_2_lists(a, b)
>>> assert c == [('a', 'b', 'c'), ('B', 'C', 'A')]
pygtftk.utils.tab_line(token=None, newline=False)

Returns a tabulated string from an input list with an optional newline

Parameters
  • token – an input list.

  • newline – if True a newline character is added to the string.

Example

>>> from pygtftk.utils import tab_line
>>> from pygtftk.utils import TAB
>>> assert tab_line(['a','b', 'c']) == 'a' + TAB + 'b' + TAB + 'c'
pygtftk.utils.to_alphanum(string)

Returns a new string in which non alphanumeric character have been replaced by ‘_’. If non alphanumeric characters are found at the beginning or end of the string, they are deleted.

Parameters

string – A character string in which non alphanumeric char have to be replaced.

Example

>>> from pygtftk.utils import to_alphanum
>>> assert to_alphanum("%gtf.bla") == 'gtf_bla'
pygtftk.utils.to_list(obj, split_char=None)

Convert a None, str, tuple to list. May also split if required.

pygtftk.utils.write_properly(a_string, afile)

Write a string to a file. If file is None, write string to stdout.

Parameters
  • a_string – a character string.

  • afile – a file object.

>>> from pygtftk.utils import write_properly