#!/usr/bin/env python

import sys, os

#####################################
# parse the location field
#####################################
def parseloc( loc):
    #loc=3R:6610646..6611879;
    #loc=3R:join(6610203..6610278,6610646..6611124);
    #loc=X:complement(174751..190386,190720..191488);
    
    # intron boundaries, indexed in chromosome
    bounds = []
    
    # end splice sites, indexed from 0 in current gene
    splices = []
    
    loc.strip()
    
    # multiple splices
    pos = loc.find("join(")
    if pos > 0 and loc.find("complement(") > 0:
        print "Found nested join, complement.  Ignoring"
        return splices

    if pos < 0:
        pos = loc.find("complement(")
    
    if pos > 0:
        # remove '(' and ')'
        loc = loc[ loc.find('(')+1 : loc.find(')') ]

        # split on ","
        introns = loc.split(',')
        #print introns
        for intron in introns:
            intron.strip()

            # begin, end bounds
            bounds.append( int(intron.split("..")[0] ))
            bounds.append( int(intron.split("..")[1] ))
            
    # single splice
    else:
        loc = loc[ loc.find(':')+1 : loc.find(';') ]
        
        # begin,end bounds
        bounds.append( int(loc.split("..")[0] ))
        bounds.append( int(loc.split("..")[1] ))
        

    # now re-index the splice sites, but only store the end
    if (len(bounds) % 2) > 0:
        print "bounds should have an even length. exiting"
        sys.exit(1)

    prevbound = 0
    length = len(bounds)
    i = 0
    while (i < length - 1):
        splices.append( (bounds[i+1] - bounds[i] + prevbound + 1))
        prevbound = splices[-1]
        i += 2
    return splices

    
#####################################
# Print segments of length length
# upstream of splice sites
#####################################
def printsplices( gname, seq, splices, length, fout):
    i = 0
    for splice in splices:
        print >>fout,">%s_%d" % (gname, i)
        if splice - length > 0:
            print >>fout, seq[ (splice - length) : splice]

        # print what we can
        else:
            print >>fout, seq[:splice]

        i += 1
        
    return


#####################################
# usage
#####################################
def usage():
    progname = sys.argv[0]
    msg = """Usage: %s <infasta> <outfasta> <# upstream nuc>

This script reads a set of introns, such as
'ftp://flybase.net/genomes/Drosophila_melanogaster/current/
fasta/dmel-all-intron-r4.2.1.fasta'.  It will split each intron at the
annotated boundaries, returning <x> upstream nucleotides of each
splice site.  Each intron name will be appended with a number of the
intron.  This indexing is arbitrary, so don't rely on it elsewhere."""

    print msg % progname


#####################################
# parseargs
#####################################
def parseargs():

    args = sys.argv[1:]
    if "-h" in args:
        usage()
        sys.exit(0)
    else:
        if len(args) != 3:
            m = "ERROR: parseargs: Require 3 arguments.  Given '%d'\n"
            sys.stderr.write( m % len(args))
            usage()
            sys.exit(1)
        
        # infile
        infile = args[0]
        if not os.path.exists( infile):
            m = "ERROR: parseargs: input file '%s' does not exist.\n"
            sys.stderr.write( m % infile)
            usagefn()
            sys.exit(1)

        outfile = args[1]

        upstream = int(args[2])

    return (infile, outfile, upstream)


#####################################
# main
#####################################
def main():

    (infile, outfile, upstream) = parseargs()

    fin = open( infile, 'r')
    fout = open( outfile, 'w')

    # current gene
    gname = ""
    splices = []
    seq = ""

    linenum = 1
    while 1:
        
        linenum += 1
        if linenum % 10000 == 0:
            print "line: %d" % linenum
        
        #>Rbp1-RC-in type=intron; loc=3R:6610646..6611879; name=Rbp1-RC-in; db_xref=FlyBase:FBgn0010252; release=r4.2.1; species=dmel; len=1234
        line = fin.readline().strip()
        #line = ">beat-IIIc-RA-in type=intron; loc=2L:complement(17186362..17186762,17187029..17189453,17189590..17192803,17193016..17247437,17247545..17256278); name=beat-IIIc-RA-in; db_xref=FlyBase:FBgn0032629; release=r4.2.1; species=dmel; len=69196"
        
        # EOF
        if line == '':
            printsplices( gname, seq, splices, upstream, fout)
            break

        # new gene
        if line[0] == '>':
            if gname and (len(splices) > 0):
                printsplices( gname, seq, splices, upstream, fout)

            seq = ""
            
            # gene name (split and take the first)
            gname = line[1:].split(' ', 1)[0]

            # find loc, parse with parseloc
            locpos = line.find("loc=")
            loc = line[ locpos : line.find(';', locpos) + 1]
            if loc < 0:
                print "loc not found! Exiting"
                sys.exit(1)
                
            # array of splice end positions
            splices = parseloc( loc)
            
        # parse the fasta format to single line
        else:
            seq += line


#####################################
# "main"
#####################################
if __name__ == "__main__":
    # set to 1 to enable profiling
    profile = 0

    if profile:
        import profile
        s = "main()"
        profile.run(s)
    else:
        main()

    sys.exit(0)




        
        

    


