# -*- coding: utf-8 -*-
import os
import sys
from collections import OrderedDict

import numpy as np

def analyze_dssp_output(filename, structured_elements=None):
    """
    Compute, per residue, the percentage of frames spent in structured elements.

    The file is read line by line:

    * Lines starting with '#' are comments. If such a line contains
      whitespace-separated tokens that start with 'RES', those tokens are
      taken as the residue labels (the last such header line wins).
    * Any other line with at least two whitespace-separated fields is one
      frame: the first field is skipped and every remaining field is the
      secondary-structure code of one residue.

    Args:
        filename: Path of the text file to read.
        structured_elements: Container of codes that count as "structured".
            Defaults to ['H', 'G', 'I', 'E', 'B'].

    Returns:
        A tuple ``(results, structured_percentage, residues, n_frames)``:
        ``results`` is an ordered mapping of residue label to percentage,
        ``structured_percentage`` is the list of percentages in column order,
        ``residues`` is the list of labels in column order and ``n_frames``
        is the number of data lines read. If the header labels are missing
        or their count does not match the number of columns, labels
        ``Res1``, ``Res2``, ... are generated instead.

    Raises:
        ValueError: If the file contains no data lines, or if the data lines
            do not all have the same number of columns.
    """
    # Default structured elements: helices and sheets
    if structured_elements is None:
        structured_elements = ['H', 'G', 'I', 'E', 'B']

    residues = []
    ss_data = []

    with open(filename, 'r') as f:
        for line_no, line in enumerate(f, 1):
            if line.startswith('#'):
                if 'RES' in line:
                    labels = [p for p in line.split() if p.startswith('RES')]
                    # Only replace the labels when the line really carries
                    # some; a comment that merely contains "RES" (for example
                    # "PRESSURE") must not wipe a header parsed earlier.
                    if labels:
                        residues = labels
                continue

            parts = line.split()
            if len(parts) > 1:
                codes = parts[1:]
                if ss_data and len(codes) != len(ss_data[0]):
                    raise ValueError(
                        "Line {} of '{}' has {} residue columns, expected {}".format(
                            line_no, filename, len(codes), len(ss_data[0])))
                ss_data.append(codes)

    # Fail with a clear message instead of a tuple-unpacking error or a
    # division by zero further down.
    if not ss_data:
        raise ValueError(
            "No secondary-structure data found in '{}'".format(filename))

    n_frames = len(ss_data)
    n_residues = len(ss_data[0])

    # zip(*rows) transposes the frame-major table into one tuple per residue.
    structured_percentage = []
    for column in zip(*ss_data):
        count_structured = sum(1 for ss in column if ss in structured_elements)
        structured_percentage.append((float(count_structured) / n_frames) * 100)

    # Labels that do not line up with the columns would silently truncate the
    # results and desynchronise `residues` from the percentages.
    if len(residues) != n_residues:
        residues = ["Res{}".format(i + 1) for i in range(n_residues)]

    # OrderedDict keeps the residues in column order on every Python version.
    results = OrderedDict(zip(residues, structured_percentage))

    return results, structured_percentage, residues, n_frames

def _find_structured_regions(percentages, threshold):
    """Return 1-based inclusive (start, end) runs with percentage >= threshold."""
    regions = []
    start = None
    for position, perc in enumerate(percentages, 1):
        if perc >= threshold:
            if start is None:
                start = position
        elif start is not None:
            regions.append((start, position - 1))
            start = None
    # A run that reaches the last residue is closed here.
    if start is not None:
        regions.append((start, len(percentages)))
    return regions


def save_results_to_file(results, percentages, residues, n_frames, output_file="dssp_analysis_results.txt"):
    """
    Write a plain-text report of the analysis and return the path written.

    The report contains the frame count, one line per entry of ``results``,
    the mean and standard deviation of ``percentages``, the most and least
    structured residue, the runs of consecutive residues whose percentage is
    at least 80, and a tab-separated "residue number vs percentage" table.
    Regions and the table use 1-based positions in ``percentages``, not the
    residue labels.

    Args:
        results: Mapping of residue label to percentage; iterated in its own
            order for the per-residue listing.
        percentages: Sequence of per-residue percentages.
        residues: Sequence of labels, indexed in parallel with ``percentages``.
        n_frames: Number of frames analyzed (reported as-is).
        output_file: Path of the report to create or overwrite.

    Returns:
        ``output_file``.

    Raises:
        ValueError: If ``percentages`` is empty.
    """
    # Validate and compute everything before opening the file so that a
    # failure cannot leave a truncated report behind.
    if len(percentages) == 0:
        raise ValueError("Cannot write a report: no per-residue percentages given")

    threshold = 80
    avg_structured = np.mean(percentages)
    std_structured = np.std(percentages)
    max_idx = int(np.argmax(percentages))
    min_idx = int(np.argmin(percentages))
    regions = _find_structured_regions(percentages, threshold)

    with open(output_file, 'w') as f:
        f.write("DSSP Secondary Structure Analysis Results\n")
        f.write("==========================================\n\n")
        f.write("Total frames analyzed: {}\n".format(n_frames))
        f.write("Structured elements considered: H, G, I, E, B (helices and sheets)\n\n")

        f.write("Residue Structured Percentage:\n")
        f.write("-----------------------------\n")
        for res, perc in results.items():
            f.write("{}: {:.2f}%\n".format(res, perc))

        f.write("\nOverall statistics:\n")
        f.write("------------------\n")
        f.write("Average time in structured elements: {:.2f}%\n".format(avg_structured))
        f.write("Standard deviation: {:.2f}%\n".format(std_structured))
        f.write("Most structured residue: {} ({:.2f}%)\n".format(residues[max_idx], percentages[max_idx]))
        f.write("Least structured residue: {} ({:.2f}%)\n".format(residues[min_idx], percentages[min_idx]))

        # The heading states ">=" because residues exactly at the threshold
        # are included in a region.
        f.write("\nConsistently structured regions (>={}% structured):\n".format(threshold))
        f.write("-----------------------------------------------\n")
        for region_start, region_end in regions:
            f.write("Residues {}-{}\n".format(region_start, region_end))

        # Save data for potential plotting with other tools
        f.write("\n\nData for plotting (residue number vs percentage):\n")
        f.write("-----------------------------------------------\n")
        f.write("Residue\tPercentage\n")
        for position, perc in enumerate(percentages, 1):
            f.write("{}\t{:.2f}\n".format(position, perc))

    # Parenthesised single-argument print works on Python 2 and Python 3.
    print("Results saved to {}".format(output_file))
    return output_file

# Main execution
def main():
    """
    Command-line entry point: analyze a DSSP data file and report the results.

    The input path is taken from the first command-line argument and falls
    back to "dssp.dat". The per-residue percentages, overall statistics and
    consistently structured regions are printed to the console, and the
    detailed report is written through save_results_to_file(). The process
    exits with status 1 if the file does not exist or cannot be analyzed.

    All print calls use the parenthesised single-argument form so the script
    runs on both Python 2 and Python 3.
    """
    # Get filename from command line argument or use default
    if len(sys.argv) > 1:
        filename = sys.argv[1]
    else:
        filename = "dssp.dat"
        print("No filename provided, using default: dssp.dat")

    # Check if file exists
    if not os.path.isfile(filename):
        print("Error: File '{}' not found!".format(filename))
        print("Please provide a valid filename as an argument.")
        print("Usage: python script.py [filename]")
        sys.exit(1)

    # A file without usable data makes the analysis raise ValueError; report
    # it as an error message instead of a traceback.
    try:
        results, percentages, residues, n_frames = analyze_dssp_output(filename)
    except ValueError as exc:
        print("Error: could not analyze '{}': {}".format(filename, exc))
        sys.exit(1)

    # Print results to console
    print("Residue Structured Percentage:")
    for res, perc in results.items():
        print("{}: {:.2f}%".format(res, perc))

    # Calculate and print overall statistics
    avg_structured = np.mean(percentages)
    std_structured = np.std(percentages)
    print("\nOverall average time in structured elements: {:.2f}% (±{:.2f}%)".format(avg_structured, std_structured))

    # Identify most and least structured residues
    max_idx = np.argmax(percentages)
    min_idx = np.argmin(percentages)
    print("Most structured residue: {} ({:.2f}%)".format(residues[max_idx], percentages[max_idx]))
    print("Least structured residue: {} ({:.2f}%)".format(residues[min_idx], percentages[min_idx]))

    # Save detailed results to file
    output_file = save_results_to_file(results, percentages, residues, n_frames)

    # Additional analysis: runs of consecutive residues at or above the
    # threshold, reported as 1-based inclusive positions.
    threshold = 80
    print("\nConsistently structured regions (>={}% structured):".format(threshold))
    region_start = None
    for position, perc in enumerate(percentages, 1):
        if perc >= threshold:
            if region_start is None:
                region_start = position
        elif region_start is not None:
            print("  Residues {}-{}".format(region_start, position - 1))
            region_start = None

    # If we're still in a region at the end
    if region_start is not None:
        print("  Residues {}-{}".format(region_start, len(percentages)))

    print("\nFor visualization, you can:")
    print("1. Import the data from {} into Excel or another spreadsheet program".format(output_file))
    print("2. Use the 'Data for plotting' section to create a line chart")
    print("3. Alternatively, install matplotlib with: pip install matplotlib")


if __name__ == "__main__":
    main()