#!/usr/local/bin/python3

import re
from collections import OrderedDict

# Input list passed to make_list_dirs(): one directory name per line.
dirNames = "000dirNames.txt"
# Input list passed to make_list_species(): the text before the first "_" on
# each line is used as the species name.
taxonSamplingList = "000taxonSampling.txt"

def make_list_dirs(dirNames):
    """Return the directory names listed in the text file *dirNames*.

    Each line of the file contributes one entry, with its trailing newline
    removed.  Blank (empty or whitespace-only) lines are skipped, because an
    empty directory name would later be joined into a path such as
    "/<file>" and point at the filesystem root.

    Args:
        dirNames: Path of the text file holding one directory name per line.

    Returns:
        A list of directory names in file order.
    """
    with open(dirNames) as handle:
        names = (line.rstrip("\n") for line in handle)
        # Only the newline is removed from kept names; blank lines are dropped.
        return [name for name in names if name.strip()]

def make_list_species(taxonSamplingList):
    """Return the species names listed in the file *taxonSamplingList*.

    Every non-blank line must contain at least one "_"; the text before the
    first "_" is taken as the species name and the remainder of the line is
    ignored.  Blank lines (for example a trailing empty line) are skipped.

    Args:
        taxonSamplingList: Path of the taxon sampling text file.

    Returns:
        A list of species names in file order.

    Raises:
        ValueError: If a non-blank line contains no "_" separator.
    """
    species_names = []
    with open(taxonSamplingList) as handle:
        for line_number, line in enumerate(handle, start=1):
            if not line.strip():
                continue
            # partition() yields the same first field as split("_")[0].
            species, separator, _rest = line.partition("_")
            if not separator:
                raise ValueError(
                    "%s line %d has no '_' separator: %r"
                    % (taxonSamplingList, line_number, line.rstrip("\n"))
                )
            species_names.append(species)
    return species_names


def read_recs_num_blastHits(dirs, htmlFileName, species_list=None):
    """Collect the per-species blast-hit counts from each directory's report.

    For every directory, the file ``<dir>/<htmlFileName>`` is scanned.  Lines
    that follow a line starting with "# of blast hits:" and precede a line
    starting with "Alignment of blast hits:" are considered, and only those
    beginning with "<" are kept.  For each species, a kept line containing
    ``">" + species + "_"`` supplies the count: the digits after the last
    ":" at the end of the line.  If several lines match one species, the last
    one wins.  A species with no matching line gets no entry.

    Args:
        dirs: Iterable of directory names.
        htmlFileName: Name of the report file inside each directory.
        species_list: Species names to look for.  When omitted, the
            module-level ``list_species`` is used.

    Returns:
        dict mapping directory name -> dict mapping species -> hit count
        (kept as the string found in the report).

    Raises:
        ValueError: If a line names a species but carries no trailing count.
    """
    if species_list is None:
        # Fall back to the script-level global, as the original code did.
        species_list = list_species

    hit_count_re = re.compile(r">[^_]+_<.*:(\d+)$")

    recs_result = {}
    for dir_name in dirs:
        report_path = dir_name + "/" + htmlFileName

        table_lines = []
        in_table = False
        with open(report_path) as handle:
            for line in handle:
                if line.startswith("Alignment of blast hits:"):
                    break
                if in_table:
                    if line.startswith("<"):
                        table_lines.append(line)
                elif line.startswith("# of blast hits:"):
                    in_table = True

        hits_by_species = {}
        for species in species_list:
            marker = ">" + species + "_"
            for line in table_lines:
                if marker not in line:
                    continue
                match = hit_count_re.search(line)
                if match is None:
                    raise ValueError(
                        "Cannot read the number of blast hits for %s in %s: %r"
                        % (species, report_path, line.rstrip("\n"))
                    )
                hits_by_species[species] = match.group(1)

        recs_result[dir_name] = hits_by_species

    return recs_result


def read_recs_dir_length(dirs, htmlFileName):
    """Return the length of the first query sequence in each directory's report.

    For every directory, the lines of ``<dir>/<htmlFileName>`` that follow a
    line starting with "Query sequences:" and precede a line starting with
    "Human SNP in dbSNP:" are parsed as FASTA by ``readFastaList_dict``.  The
    length of the first record's sequence is stored for that directory.

    Args:
        dirs: Iterable of directory names.
        htmlFileName: Name of the report file inside each directory.

    Returns:
        OrderedDict mapping directory name -> length (int) of the first query
        sequence, in the order of *dirs*.

    Raises:
        ValueError: If the query section of a report holds no FASTA record.
    """
    recs_dir_length_fn = OrderedDict()
    for dir_name in dirs:
        report_path = dir_name + "/" + htmlFileName

        query_lines = []
        in_section = False
        with open(report_path) as handle:
            for line in handle:
                if line.startswith("Human SNP in dbSNP:"):
                    break
                if in_section:
                    query_lines.append(line)
                elif line.startswith("Query sequences:"):
                    in_section = True

        recs_fn = readFastaList_dict(query_lines)
        if not recs_fn:
            raise ValueError("No query sequence found in " + report_path)
        # Only the first record is measured.
        first_sequence = next(iter(recs_fn.values()))
        recs_dir_length_fn[dir_name] = len(first_sequence)

    return recs_dir_length_fn


'''
def read_recs_positions_blastHits(dirs,fastaFileName):
    recs_result = {}
    for dir in dirs:
        #print(dir)
        recs_fasta = readFasta_dict(dir + "/" + fastaFileName)
        #recs_fasta = readPhy_dict(dir + "/" + fastaFileName)
        
        recs_result_each = {}
        for species in list_species:
            list_tmp = []
            for name in recs_fasta.keys():
                if ">" + species + "_" in name:
                #if re.search("^" + species + "_", name):
                    #print("name", name)
                    match = re.search("_([^-_]+)-([^-]+)-([^-_]+)_[^_]+_[^_]+$", name)
                    chromosome = match.group(1)
                    start = match.group(2)
                    end = match.group(3)
                    #print("chromosome", chromosome)
                    #print("start", start)
                    #print("end", end)
                    #exit()
                    if start.startswith("c"):
                        startTMP = start[1:]
                        start = end
                        end = startTMP
                    list_tmp.append([chromosome, start, end])
            #print(list_tmp)
            #exit()
            recs_result_each[species] = list_tmp
        recs_result[dir] = recs_result_each

    return recs_result
'''

def read_recs_positions_html_blastHits(dirs, htmlFileName, species_list=None):
    """Collect the coordinates of each species' blast hits from the reports.

    For every directory, the lines of ``<dir>/<htmlFileName>`` lying between
    the first and the second line that start with "Used4treeSearch" are
    examined.  Each such line containing ``">" + species + "_"`` must carry a
    ``chromosome-start-end`` triple, either as the text of a
    ``target="_blank"`` link or embedded in the sequence name just before
    ``</font``.  When the start field begins with "c", the "c" is dropped and
    start and end are swapped.

    Args:
        dirs: Iterable of directory names.
        htmlFileName: Name of the report file inside each directory.
        species_list: Species names to look for.  When omitted, the
            module-level ``list_species`` is used.

    Returns:
        dict mapping directory name -> dict mapping species -> list of
        ``[chromosome, start, end]`` lists (all strings), in file order.

    Raises:
        SystemExit: With status 1, after printing a message, when a sequence
            name line carries no recognisable coordinates.
    """
    if species_list is None:
        # Fall back to the script-level global, as the original code did.
        species_list = list_species

    link_coord_re = re.compile(r'target="_blank">([^>-]+)-([^-]+)-([^<-]+)<')
    # The original code tested sequence-name lines with one pattern and then
    # extracted with a slightly different one (third field: [^<-] vs [^-_]).
    # Both are kept so the set of accepted lines is unchanged; a line that
    # passes the gate but fails extraction is now reported as an error
    # instead of crashing on a None match.
    name_gate_re = re.compile(r'_([^_-]+)-([^-]+)-([^<-]+)_[^_]+_[^_]+</font')
    name_coord_re = re.compile(r'_([^_-]+)-([^-]+)-([^-_]+)_[^_]+_[^_]+</font')

    recs_result = {}
    for dir_name in dirs:
        section_lines = []
        in_section = False
        with open(dir_name + "/" + htmlFileName) as handle:
            for line in handle:
                if line.startswith("Used4treeSearch"):
                    if in_section:
                        break
                    in_section = True
                    continue
                if in_section:
                    section_lines.append(line.rstrip("\n"))

        positions_by_species = {}
        for species in species_list:
            marker = ">" + species + "_"
            positions = []
            for line in section_lines:
                if marker not in line:
                    continue

                match = link_coord_re.search(line)
                if match is None and name_gate_re.search(line):
                    match = name_coord_re.search(line)
                if match is None:
                    print("Error.")
                    print("Check your alignment.html file")
                    print("Cannot identify coordinate from sequence name line.")
                    # Non-zero status so callers can detect the failure.
                    raise SystemExit(1)

                chromosome, start, end = match.groups()
                if start.startswith("c"):
                    # "c<start>" form: drop the "c" and swap the two fields.
                    start, end = end, start[1:]
                positions.append([chromosome, start, end])
            positions_by_species[species] = positions
        recs_result[dir_name] = positions_by_species

    return recs_result

def readPhy_dict(InfileNameFN):
    """Read name/sequence pairs from a whitespace-separated alignment file.

    Lines whose first character is a digit are skipped (the original code
    treats them as header lines).  Blank lines are skipped as well.  Every
    other line must end with ``<name><spaces><sequence>``; the last two
    space-separated fields are stored as name and sequence.

    Args:
        InfileNameFN: Path of the file to read.

    Returns:
        OrderedDict mapping name -> sequence, in file order.  A repeated name
        overwrites the earlier entry.

    Raises:
        ValueError: If a non-blank, non-header line lacks a name and a
            sequence separated by spaces.
    """
    header_re = re.compile(r"^\d")
    record_re = re.compile(r"([^ ]+) +([^ ]+)$")

    seqDictFN = OrderedDict()
    with open(InfileNameFN, "r") as infile:
        for line_number, line in enumerate(infile, start=1):
            line = line.rstrip("\n")
            if not line.strip() or header_re.search(line):
                continue
            match = record_re.search(line)
            if match is None:
                raise ValueError(
                    "%s line %d is not '<name> <sequence>': %r"
                    % (InfileNameFN, line_number, line)
                )
            seqDictFN[match.group(1)] = match.group(2)
    return seqDictFN

def readFastaList_dict(lines_fn):
    """Parse FASTA records from an iterable of lines that may contain markup.

    A line starting with ">" opens a new record; the whole line (including
    the ">") becomes the record name.  Every other non-empty line has its
    ``<...>`` tags removed and is appended to the current record's sequence.
    Lines appearing before the first header are ignored when they contain
    nothing but markup or whitespace.

    Args:
        lines_fn: Iterable of text lines, with or without trailing newlines.

    Returns:
        OrderedDict mapping header line -> concatenated sequence, in input
        order.  A repeated header resets that record's sequence.

    Raises:
        ValueError: If sequence text appears before any header line.
    """
    tag_re = re.compile(r"<[^>]+>")

    seqDictFN = OrderedDict()
    Name = None
    for line in lines_fn:
        line = line.rstrip("\n")
        if not line:
            continue
        if line[0] == ">":
            Name = line
            seqDictFN[Name] = ""
            continue

        residues = tag_re.sub("", line)
        if Name is None:
            # Nothing to attach this line to yet; tolerate pure markup only.
            if residues.strip():
                raise ValueError("Sequence data before first FASTA header: %r" % line)
            continue
        seqDictFN[Name] += residues
    return seqDictFN

def readFasta_dict(InfileNameFN):
    """Read a FASTA file into an ordered mapping of header -> sequence.

    A line starting with ">" opens a new record; the whole line (including
    the ">") becomes the record name.  Following lines are concatenated to
    form its sequence.  Empty lines are skipped.

    Args:
        InfileNameFN: Path of the FASTA file.

    Returns:
        OrderedDict mapping header line -> concatenated sequence, in file
        order.  A repeated header resets that record's sequence.

    Raises:
        ValueError: If sequence text appears before any header line.
    """
    seqDictFN = OrderedDict()
    Name = None
    with open(InfileNameFN, "r") as infile:
        for line_number, line in enumerate(infile, start=1):
            line = line.rstrip("\n")
            if not line:
                # An empty line has no first character to inspect.
                continue
            if line[0] == ">":
                Name = line
                seqDictFN[Name] = ""
            elif Name is None:
                raise ValueError(
                    "%s line %d: sequence data before first FASTA header"
                    % (InfileNameFN, line_number)
                )
            else:
                seqDictFN[Name] += line
    return seqDictFN



################################################################################# 

# Inputs: the directories to summarise and the species to report on.
# list_species is a module-level global that the read_recs_* functions use.
dirs = make_list_dirs(dirNames)
list_species = make_list_species(taxonSamplingList)

html_resultfile = "010_alignment.html"

#### Number of blast hits list
recs_num_blastHits = read_recs_num_blastHits(dirs, htmlFileName=html_resultfile)
recs_dir_length = read_recs_dir_length(dirs, htmlFileName=html_resultfile)

# Tab-separated table: one column per directory, a query-length row, then one
# row per species.  Every cell (including the last) is followed by a tab.
with open("020out_num_blasthits.txt", "w") as out:
    out.write("species\t")
    for dir_name in dirs:
        out.write(dir_name + "\t")
    out.write("\n")

    out.write("query_length\t")
    for length_querySeq in recs_dir_length.values():
        out.write(str(length_querySeq) + "\t")
    out.write("\n")

    for species in list_species:
        out.write(species + "\t")
        for dir_name in dirs:
            out.write(recs_num_blastHits[dir_name][species] + "\t")
        out.write("\n")

###### Positions of blast hits
recs_positions_blastHits = read_recs_positions_html_blastHits(dirs, htmlFileName=html_resultfile)

# One paragraph per species: the species name, then one line per hit holding
# the directory followed by chromosome, start and end; a blank line ends the
# paragraph.
with open("020out_pos_blasthits.txt", "w") as out2:
    for species in list_species:
        out2.write(species + "\n")
        for dir_name in dirs:
            for positions in recs_positions_blastHits[dir_name][species]:
                out2.write(dir_name + "\t")
                for ele in positions:
                    out2.write(ele + "\t")
                out2.write("\n")
        out2.write("\n")
    out2.write("\n")


# Stop here with a success status, as the original exit() did, without
# relying on the interactive helper installed by the site module.
raise SystemExit



