66import copy
77from progressbar import ProgressBar
88
9- # import time
109import math
11- # import numpy as np
12- # import pybedtools
13-
1410
1511import seqcluster .libs .logger as mylog
1612from seqcluster .libs import utils
@@ -39,15 +35,10 @@ def _get_seqs_from_cluster(seqs, seen):
3935 already_in = set ()
4036 not_in = []
4137
42- already_in = map (seen .get , seqs )
38+ already_in = [ e for e in map (seen .get , seqs )]
4339 # if isinstance(already_in, list):
4440 already_in = filter (None , already_in )
4541 not_in = set (seqs ) - set (seen .keys ())
46- # for s in seqs:
47- # if s in seen:
48- # already_in.add(seen[s])
49- # else:
50- # not_in.append(s)
5142 return list (set (already_in )), list (not_in )
5243
5344
@@ -61,15 +52,21 @@ def reduceloci(clus_obj, path):
6152 large = 0
6253 current = clus_obj .clusid
6354 logger .info ("Number of loci: %s" % len (clus_obj .loci .keys ()))
55+ avg_loci_cluster = len (clus_obj .loci .keys ())/ len (clus_obj .seq )
56+ if avg_loci_cluster > 0.5 :
57+ logger .warning ("The avg number of loci by sequences is close to 0.5: %s" ,
58+ avg_loci_cluster )
59+ logger .warning ("This could mean you have sequences over the genome, rare in "
60+ "a typical small RNA data. This can cause errors during the execution "
61+ "or long computing time." )
6462 bar = ProgressBar (maxval = len (current )).start ()
6563 bar .update (0 )
6664 for itern , idmc in enumerate (current ):
6765 bar .update (itern )
6866 logger .debug ("_reduceloci: cluster %s" % idmc )
6967 c = copy .deepcopy (list (current [idmc ]))
70-
7168 n_loci = len (c )
72- if n_loci < 1000 :
69+ if n_loci < 100 :
7370 filtered , n_cluster = _iter_loci (c , clus_obj .clus , (clus_obj .loci , clus_obj .seq ), filtered , n_cluster )
7471 else :
7572 large += 1
@@ -98,7 +95,7 @@ def _write_cluster(metacluster, cluster, loci, idx, path):
9895 with open (out_file , 'w' ) as out_handle :
9996 for idc in metacluster :
10097 for idl in cluster [idc ].loci2seq :
101- pos = loci [idl ].list ()
98+ pos = [ e for e in loci [idl ].list ()]
10299 print ("\t " .join (pos [:4 ] + [str (len (cluster [idc ].loci2seq [idl ]))] + [pos [- 1 ]]), file = out_handle , end = "" )
103100
104101
@@ -136,7 +133,6 @@ def _iter_loci(meta, clusters, s2p, filtered, n_cluster):
136133 n_loci = len (meta )
137134 n_loci_prev = n_loci + 1
138135 cicle = 0
139- # [logger.note("BEFORE %s %s %s" % (c.id, idl, len(c.loci2seq[idl]))) for idl in c.loci2seq]
140136 internal_cluster = {}
141137 if n_loci == 1 :
142138 n_cluster += 1
@@ -176,11 +172,11 @@ def _iter_loci(meta, clusters, s2p, filtered, n_cluster):
176172 return filtered , n_cluster
177173
178174
179- def _remove_loci (ci , idl ):
180- for (idl , lenl ) in locilen_sorted :
181- logger .debug ("_remove_loci:remove locus %s with len %s:" % (idl , lenl ))
182- c .loci2seq .pop (idl , "None" )
183- c .locilen .pop (idl , "None" )
175+ # def _remove_loci(ci, idl):
176+ # for (idl, lenl) in locilen_sorted:
177+ # logger.debug("_remove_loci:remove locus %s with len %s:" % (idl, lenl))
178+ # c.loci2seq.pop(idl, "None")
179+ # c.locilen.pop(idl, "None")
184180
185181
186182def _convert_to_clusters (c ):
@@ -512,7 +508,6 @@ def _merge_loci_in_cluster(c, new_c, idl, current_seqs):
512508def _merge_with_first_loci (c , new_c , first_idl , idl , current_seqs ):
513509 logger .debug ("_merge_with_first_loci:join first" )
514510 locus_seqs = c .loci2seq [idl ]
515- common = len (set (locus_seqs ).intersection (current_seqs ))
516511 seen = list (set (locus_seqs ).union (current_seqs ))
517512 new_c .add_id_member (list (locus_seqs ), first_idl )
518513 c .loci2seq .pop (idl , "None" )
@@ -522,7 +517,6 @@ def _merge_with_first_loci(c, new_c, first_idl, idl, current_seqs):
522517
523518def _remove_seqs_from_loci (c , idl , current_seqs ):
524519 current = c .loci2seq [idl ]
525- common = len (set (current ).intersection (current_seqs ))
526520 seen = list (set (current ).intersection (current_seqs ))
527521 unseen = list (set (sorted (current )).difference (sorted (seen )))
528522 logger .debug ("_remove_seqs_from_loci:seen %s unseen %s" % (len (seen ), len (unseen )))
0 commit comments