#!/usr/bin/perl
use strict;
use warnings;
use List::Util qw/min max sum/;
$SIG{__WARN__}=sub{die "SAM file format error!\n"};

################  parameters  #############################

# suffix: suffix of paired-end reads, "F3,F5-RNA"
# file_tag: the alignment file
# file_merge: output file
# mutation: desired type of mutation

my ($suffix,$file_tag,$file_merge,$mutation,$fr)=@ARGV;
my (%fr,$i,$j,%tags,@items,$tag,$name,$md);
my ($l_ref,$r_ref,$l_len,$r_len,$len,@mut);
my ($new_pos,@new_CIGAR,@new_qual);
my ($suffix_f,$suffix_r);

if ($fr eq "fr-second") {%fr=("flag83"=>16,"flag99"=>0);}
if ($fr eq "fr-first") {%fr=("flag83"=>0,"flag99"=>16);}

################  main program  #############################

# compile regex
($suffix_f,$suffix_r)=split(",",$suffix);
$suffix_f='^(.*)'.$suffix_f.'$';
$suffix_r='^(.*)'.$suffix_r.'$';
$suffix_f=qr/$suffix_f/;
$suffix_r=qr/$suffix_r/;

# open file
open(FILE_IN,$file_tag) or die "read alignment file ".$file_tag." failed!\n";
open(FILE_OUT,">".$file_merge) or die "can not write to file ".$file_merge."!\n";

while (<FILE_IN>) # find mates
{
  # preliminary processing of reads
  if ($_=~/^@/) {next;}
  @items=split("\t",$_);
  unless ($items[1]==147 || $items[1]==163 || $items[1]==83 || $items[1]==99) {next;} # delete gapped mapping and keep only mapped reads
  if ($_=~/(MD\:Z\:.*?)[\t\n]/) {$md=$1;}
  else {$md=-1;} # get MD field  

  # put read under the right key in the hash
  # information stored: pos, CIGAR, seq, qual, MD
  if ($items[1]==83)
  {
    $items[0]=~/$suffix_f/;
    $name=$1."_".$items[2]."_".($items[3]+$items[7]);
    $tags{$name}->{"strand"}=$fr{"flag83"};
    $tags{$name}->{"chr"}=$items[2];
    $tags{$name}->{"f"}=[$items[3],$items[5],$items[9],$items[10],$md];
  }elsif ($items[1]==99)
  {
    $items[0]=~/$suffix_f/;
    $name=$1."_".$items[2]."_".($items[3]+$items[7]);
    $tags{$name}->{"strand"}=$fr{"flag99"};
    $tags{$name}->{"chr"}=$items[2];
    $tags{$name}->{"f"}=[$items[3],$items[5],$items[9],$items[10],$md];
  }elsif ($items[1]==147)
  {
    $items[0]=~/$suffix_r/;
    $name=$1."_".$items[2]."_".($items[3]+$items[7]);
    $tags{$name}->{"r"}=[$items[3],$items[5],$items[9],$items[10],$md];
  }else
  {
    $items[0]=~/$suffix_r/;
    $name=$1."_".$items[2]."_".($items[3]+$items[7]);
    $tags{$name}->{"r"}=[$items[3],$items[5],$items[9],$items[10],$md];
  }

  # check if the mates are already found,
  unless ((exists $tags{$name}->{"f"}) && (exists $tags{$name}->{"r"})) {next;}

  if ($tags{$name}->{"strand"}==0)
  {
    $l_ref=$tags{$name}->{"f"};
    $r_ref=$tags{$name}->{"r"};
  }else
  {
    $r_ref=$tags{$name}->{"f"};
    $l_ref=$tags{$name}->{"r"};
  }

  # calculate mate length and check distance
  # it's possible that one or both of the mates are cut into multiple segments during alignment. This will filter out most of such cases
  $l_len=0;
  $r_len=0;
  while ($$l_ref[1]=~/([0-9]+)([MD])/g) {$l_len+=$1;}
  while ($$r_ref[1]=~/([0-9]+)([MD])/g) {$r_len+=$1;}
  unless ($$l_ref[0]<=$$r_ref[0]+$r_len-1 && $$r_ref[0]<=$$l_ref[0]+$l_len-1 && $$l_ref[1]!~/N/ && $$r_ref[1]!~/N/) 
  {
    delete $tags{$name}; # mates that don't overlap or mates of which at least one span introns will be deleted
    next;
  }

  # write name, strand, chr, pos to output file
  $new_pos=min($$l_ref[0],$$r_ref[0]);
  print FILE_OUT $name."\t".$tags{$name}->{"strand"}."\t".$tags{$name}->{"chr"}."\t".$new_pos."\t*\t";
   
  # form fake CIGAR field
  # get counts and positions of desired type of mutation, write to the output file
  @new_CIGAR=("M") x (max($$l_ref[0]+$l_len,$$r_ref[0]+$r_len)-$new_pos);

  @mut=(); # left
  read_mut($mutation,$$l_ref[1],$$l_ref[2],$$l_ref[4],\@mut,$tags{$name}->{"strand"});  
  map {$new_CIGAR[$_+$$l_ref[0]-$new_pos-1]="D";} @mut;

  @mut=(); # right
  read_mut($mutation,$$r_ref[1],$$r_ref[2],$$r_ref[4],\@mut,$tags{$name}->{"strand"});
  map {$new_CIGAR[$_+$$r_ref[0]-$new_pos-1]="D";} @mut; 
 
  $i=0;
  $j=1;

  while ($i<=$#new_CIGAR)
  {  
    while ($j<=$#new_CIGAR && $new_CIGAR[$i] eq $new_CIGAR[$j]) {$j++;}
    print FILE_OUT ($j-$i).$new_CIGAR[$i];
    $i=$j;
    $j++;
  }

  print FILE_OUT "\t*\t*\t*\t*\t";
  
  # fake quality score
  @new_qual=(split(//,$$l_ref[3]),split(//,$$r_ref[3]));
  map {$_=ord($_);} @new_qual;
  print FILE_OUT chr(int(sum(@new_qual)/($#new_qual+1)))."\n";

  delete $tags{$name}; # delete the mates from the tag hash 
}

close(FILE_IN);
close(FILE_OUT);

###########  subroutines  ###############################

sub read_mut # read mutatant position
{
  my ($mut_type,$CIGAR,$seq,$MD,$mut_pos_ref,$strand)=@_;
  my $ref_pos=0;
  my $tag_pos=0;
  my ($regex,$match,$temp);

  if ($CIGAR=~/([0-9]+)S.*[0-9]+M/) {$seq=substr($seq,$1);} # offset soft-clipping
  $CIGAR=~s/[0-9]+H//g; # offset hard-clipping

  while ($CIGAR=~/([0-9]+)([MDI])/g)
  {
    if ($2 eq "M") 
    {
      $ref_pos+=$1;
      $tag_pos+=$1;
    }elsif ($2 eq "I")
    {
      {if ($mut_type=~/Ins/) {push @$mut_pos_ref,($ref_pos+1);}}
      substr($seq,$tag_pos,$1)="";
    }else
    { 
      {if ($mut_type=~/Del/) {push @$mut_pos_ref,($ref_pos+1);}}
      $ref_pos+=$1;
    }
  }  

  $ref_pos=0;
  $tag_pos=0;

  while ($MD=~/([0-9]+|[ACGTN]|\^[ACGTN]+)/g) 
  {
    $match=$1;
    if ($match=~/[0-9]+/) 
    {
      $ref_pos+=$match;
      $tag_pos+=$match;      
    }elsif ($match=~/^[ACGTN]$/)
    {
      $ref_pos+=1;
      $temp=substr($seq,$tag_pos,1);

      if ($strand==16)
      {
        $match=transform($match);
        $temp=transform($temp);
      }

      $temp=$match."2".$temp;
      $regex=qr/$temp/;
      $tag_pos+=1;
      if ($mut_type=~$regex) {push @$mut_pos_ref,$ref_pos;}
    }else
    {
      $ref_pos+=length($match)-1;
    }
  }
}

sub transform # negative strand to positive strand
{
  my $base=$_[0];
  
  if ($base eq "A")
  {
    $base="T";
  }elsif ($base eq "T")
  {
    $base="A";
  }elsif ($base eq "C")
  {
    $base="G";
  }elsif ($base eq "G")
  {
    $base="C";
  }

  return($base);
}

############  explanation of the algorithm  ####################

=head1

(1) flag of paired-end mates

  147 10010011 3' segment positive strand
  163 10100011 3' segment negtaive strand
  83  01010011 5' segment negative strand
  99  01100011 5' segment positive strand

(2) how mates are merged 

positive strand

      FFFFFFFF
           RRRRRRRRRR
      ---------------

      FFFFFFFFFFFFFFF
    RRRRRRRRRRRRRR
    -----------------

negative strand

      RRRRRRRR
             FFFFFFFFFF
      -----------------

           RRRRRRRR    
   FFFFFFFFFFFFFFFFFFFFFFFF
   ------------------------

(3) the format of the dummy SAM file

  Name: will be a modified as the original name plus chromosome and the sum of the starting positions of the mates
  Flag: will be 0 and 16 instead of 99, 147, 83, 163
  Pos: will the new starting position
  CIGAR: will only have Ms and Ds. Ds are the dummy mutations which mark the positions of the desired type of mutations in the original paired-end file
  Seq: will be *
  Qual: will only have one character, which is the average quality scores of the two mates
  MD: will be -1

=cut
