-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathunmapped_seq2.py
38 lines (30 loc) · 1.04 KB
/
unmapped_seq2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
'''The script reads a list of sequences and prints out sequences not in a
list in FASTA format.
'''
import sys
from pygr import seqdb, sequtil
def main():
try:
input_file = sys.argv[1]
fasta_file = sys.argv[2]
except IndexError:
print >> sys.stderr, \
'unmapped_seq2.py <text file> <fasta file> [min length=50]'
try:
min_length = int(sys.argv[3])
except IndexError:
min_length = 50
db = seqdb.SequenceFileDB(fasta_file)
input_sequences = set()
print >> sys.stderr, 'Reading sequences...',
for line in open(input_file):
input_sequences.add(line.strip())
print >> sys.stderr, 'total %d' % len(input_sequences)
print >> sys.stderr, 'Writing unmapped sequences...'
for seq in db:
sequence = db[seq]
if (seq not in input_sequences and len(sequence) >= min_length):
print >> sys.stderr, 'Writing %s, %d bp' % (seq, len(sequence))
sequtil.write_fasta(sys.stdout, str(sequence), id=seq)
if __name__=='__main__':
main()