#!/usr/bin/perl
use strict;
use warnings;
use POSIX qw(floor);

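# Parameters, in order:
#   1. iclip: number of nt each read position is extended to both sides
#   2. step: bin size in nt
#   3-4. the SAM format files for the two conditions
#   5. the output file
#   6-7. min1, min2: thresholds for the two conditions; a cluster is kept if
#        one of its bins has a count of at least min*step in either condition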

# Positional arguments: iclip (nt added to both sides of each read position),
# step (bin size in nt), the two SAM files, the output file, and the
# thresholds min1/min2 used to decide whether a cluster is worth printing.
my ($iclip,$step,$file1,$file2,$file_out,$min1,$min2)=@ARGV;

if (@ARGV<5)
{
  die "Usage: ".$0." iclip step file1.sam file2.sam file_out [min1 [min2]]\n";
}
# a zero step would end in a division by zero while binning
if (!($step>0)) {die "The step size must be a positive number!\n";}

# a missing threshold has always evaluated to 0 (every cluster passes); make that explicit
if (!defined $min1) {$min1=0;}
if (!defined $min2) {$min2=0;}

############  read SAM files  ##################

# %data: "chr\tstrand" -> bin start -> [count in condition 1, count in condition 2]
my %data;
read_file(\%data,$file1,1,$step,$iclip);
read_file(\%data,$file2,2,$step,$iclip);

############  write intermediate files  ###########

my $id=1;   # id number for cluster
my $prt=""; # lines of the cluster currently being assembled
my $flag=0; # set to 1 once a bin of the current cluster reaches a threshold

# All cDNA counts of a bin are collapsed to its first base; the remaining
# 2*step-1 columns of each condition (including mutant tag counts) are 0.
# The padding only depends on $step, so it is built once.
my $zeros=join("",map {"\t0"} (2..2*$step));

open(my $out,'>',$file_out) or die "Can't open output file ".$file_out.": ".$!."\n";

# sorted so that cluster ids and line order are reproducible between runs
foreach my $chrstrand (sort keys %data)
{
  my ($chr,$strand)=split("\t",$chrstrand);
  my $bins=$data{$chrstrand};
  my $previous=-2*$step; # guarantees that the first bin opens a new cluster

  foreach my $pos (sort {$a <=> $b} (keys %{$bins}))
  {
    if ($pos-$previous>$step) # this bin is not adjacent to the previous one: new cluster
    {
      if ($flag==1) # flush the finished cluster only if it passed a threshold
      {
        print $out $prt;
        $id++;
      }
      $prt="";
      $flag=0;
    }

    my ($count1,$count2)=@{$bins->{$pos}};

    # id, chr, pos, strand, then 2*step columns per condition
    $prt.=join("\t",$id,$chr,$pos,$strand,$count1).$zeros;
    $prt.="\t".$count2.$zeros."\n";
    $previous=$pos;

    # the cluster is not a trivial one with only a few cDNA counts
    if ($count1>=$min1*$step || $count2>=$min2*$step) {$flag=1;}
  }
}

# The loop only flushes a cluster when the next one starts, so the very last
# cluster is still pending here and has to be written explicitly.
if ($flag==1) {print $out $prt;}

close($out) or die "Can't write output file ".$file_out.": ".$!."\n";

###########  subroutines  #######################

# read_file(\%data, $file, $id, $step, $iclip)
#
# Reads one SAM format file and adds its binned cDNA counts to %data.
#
#   $data_ref  hash reference that is filled in place:
#              "chr\tstrand" -> bin start -> [count condition 1, count condition 2]
#   $file      name of the SAM file (opened read-only, taken literally)
#   $id        1 or 2: which of the two counts is incremented
#   $step      bin size in nt; positions are rounded down to a multiple of it
#   $iclip     every read position is extended by this many nt to both sides,
#              and each extended position adds one count to its bin
#
# Header lines (starting with @), lines with fewer than four fields and reads
# whose FLAG is neither 0 (counted as "+") nor 16 (counted as "-") are skipped.
# Dies if the file cannot be opened. Returns the result of close().
sub read_file
{
  my ($data_ref,$file,$id,$step,$iclip)=@_;
  my $extend=$iclip; # extend to each side by $iclip for a cDNA count
  my $column=$id-1;  # index into the [condition 1, condition 2] pair

  # three-argument open: characters such as > or | in the name are not interpreted
  open(my $in,'<',$file) or die "Can't open SAM format file ".$file.": ".$!."\n";

  while (my $line=<$in>)
  {
    if ($line=~/^@/) {next;}
    chomp($line);

    my @items=split("\t",$line);
    # blank or truncated lines would otherwise be counted as FLAG 0 / POS 0
    if (@items<4) {next;}

    my ($sam_flag,$chr,$start)=@items[1..3];
    if ($sam_flag!=0 && $sam_flag!=16) {next;}

    my $strand=($sam_flag==0) ? "+" : "-";
    my $chrstrand=$chr."\t".$strand; # the first layer is chr and strand

    foreach my $i ($start-$extend..$start+$extend)
    {
      if ($i<0) {next;}
      my $pos=floor($i/$step)*$step; # the second layer is the bin start in nt

      # the value is an array reference with two integers, corresponding to
      # the cDNA counts in both conditions
      $data_ref->{$chrstrand}->{$pos} ||= [0,0];
      $data_ref->{$chrstrand}->{$pos}->[$column]++;
    }
  }

  return close($in);
}


