function [mfreq, tfreq, mis, zs, dat, agg, ureg, treg, ustrn] = processdata( clfath, ncfath )
% PROCESSDATA Retrieve variables from aligned reads.
%
% Loads two read files with loadreads, pools their frequency tables,
% decomposes every read of the first file into per-base records (dcpread)
% and aggregates those records per chromosome (aggread).
%
% Inputs:
%   clfath - file path handed to loadreads; its reads are decomposed and
%            aggregated.
%   ncfath - file path handed to loadreads; only its tfreq / mfreq tables
%            are used.
%
% Outputs:
%   mfreq - 2x1 cell of mutation frequency tables (1: '-', 2: '+'),
%           summed over both files.
%   tfreq - read-length frequency table, summed over both files.
%   mis   - struct with fields tfreq and mfreq taken from ncfath (after
%           equallen).
%   zs    - cell array; zs{k,i} is a logical row flagging the zero entries
%           of the pooled mfreq{k}(i,1:i).
%   dat   - struct of per-base columns (chr, loc, mut, pos, slc, reg, rlen,
%           strn, zs), sorted by slc.
%   agg   - struct of aggregated columns (chr, loc, mut, rd, reg, slc,
%           strn, zs) as returned by aggread.
%   ureg  - unique region ids of clfath.
%   treg  - number of unique region ids of clfath.
%   ustrn - nominal({'-','+'}).
%
% NOTE(review): equallen is defined outside this file; it is presumably
% expected to bring the tables of both data sets to a common size so that
% they can be added -- confirm against its implementation.

% load data sets
[chr, reg, spos, epos, mut, strn, rlen, tfreq, mfreq, trd, ureg, treg] = loadreads(clfath);
[~, ~, ~, ~, ~, ~, ~, mis.tfreq, mis.mfreq, ~, ~, ~] = loadreads(ncfath);

fprintf('Data sets have been loaded. \n');

% pool the frequency tables of both data sets
[ mfreq, mis.mfreq, tfreq, mis.tfreq ] = equallen( mfreq, mis.mfreq, tfreq, mis.tfreq );
mfreq{1} = mfreq{1} + mis.mfreq{1};
mfreq{2} = mfreq{2} + mis.mfreq{2};
tfreq = tfreq + mis.tfreq;

% flag (read length, position) combinations with a zero pooled count
mrlen = length(tfreq);
zs = cell(2, mrlen); % preallocate instead of growing inside the loop
for k = 1:2
    for i = 1:mrlen
        zs{k,i} = logical(~mfreq{k}(i,1:i));
    end
end

ustrn = nominal({'-','+'});
% unique chromosomes, ordered
uchr = nominal({'chr1', 'chr2', 'chr3', 'chr4', ...
    'chr5', 'chr6', 'chr7', 'chr8', ...
    'chr9', 'chr10', 'chr11', 'chr12', ...
    'chr13', 'chr14', 'chr15', 'chr16', ...
    'chr17', 'chr18', 'chr19', 'chrM', ...
    'chrX', 'chrY'});
tchr = length(uchr); % total unique chromosomes

% cell array
% each row cell contains {dchr, dloc, dmut, dpos, dreg, drlen, dstrn, dzs};
% for each read decomposed.
cdat = cell(trd,8);

% this loop takes ~ 5 mins.
parfor i = 1:trd
    cdat(i,:) = dcpread( chr(i), uchr, reg(i), spos(i), epos(i), strn(i), ustrn, ...
        rlen(i), mut(i,:), zs);
end

fprintf('Decomposition of reads has been finished. \n');

% convert the cell arrays cdat{:,i} to a matrix.
dat.chr = cat(1,cdat{:,1});
dat.loc = cat(1,cdat{:,2});
dat.mut = cat(1,cdat{:,3});
dat.pos = cat(1,cdat{:,4});
dat.slc = zeros(size(dat.loc),class(dat.loc));
dat.reg = cat(1,cdat{:,5});
dat.rlen = cat(1,cdat{:,6});
dat.strn = cat(1,cdat{:,7});
dat.zs = cat(1,cdat{:,8});

% assign strand specific + chromosome specific genomic locations (slc)
% this is for coding efficiency. no real meaning.
% Numbering runs over all chromosomes of the negative strand first
% (dat.strn == false), then over all chromosomes of the positive strand.
offset = int32(0);
for s = [false true]
    for i = 1:tchr
        subset = (dat.strn == s) & (dat.chr == i);
        if any(subset)
            [~, ~, ord] = unique(dat.loc(subset),'stable');
            dat.slc(subset) = int32(ord) + offset;
            offset = max(dat.slc(subset));
        end
    end
end

% sort all fields in dat by slc
[dat.slc, sidx] = sort(dat.slc);
dat.chr = dat.chr(sidx);
dat.loc = dat.loc(sidx);
dat.mut = dat.mut(sidx,:);
dat.pos = dat.pos(sidx);
dat.reg = dat.reg(sidx);
dat.rlen = dat.rlen(sidx);
dat.strn = dat.strn(sidx);
dat.zs = dat.zs(sidx,:);

aggdata = cell(tchr,8);
cmut = cell(tchr,1);
cstrn = cell(tchr,1);

% split the per-read mutation matrix and strands by chromosome
parfor i = 1:tchr
    onchr = (chr == uchr(i));
    % keep every mutation column: aggread indexes cmut by row and strips
    % the zero padding itself, so it needs the full rows, not just the
    % first column that linear logical indexing would return.
    cmut{i} = mut(onchr, :);
    cstrn{i} = strn(onchr);
end

parfor i = 1:tchr
    sel = (dat.chr == i);
    aggdata(i,:) = aggread(uchr(i), dat.reg(sel), dat.loc(sel), ...
        dat.slc(sel), cmut{i}, cstrn{i}, dat.strn(sel), ...
        ustrn, dat.zs(sel));
end

fprintf('Agreegation of reads has been finished. \n');

agg.chr = cat(1,aggdata{:,1});
agg.loc = cat(1,aggdata{:,2});
agg.mut = cat(1,aggdata{:,3});
agg.rd = cat(1,aggdata{:,4});
agg.reg = cat(1,aggdata{:,5});
agg.slc = cat(1,aggdata{:,6});
agg.strn = cat(1,aggdata{:,7});
agg.zs = cat(1,aggdata{:,8});
end

% Subfunction to import a data set in txt file format.
function [chr, reg, spos, epos, mut, strn, rlen, tfreq, mfreq, trd, ureg, treg] = loadreads( filepath )
% LOADREADS Load aligned read sequences from a tab-delimited text file.
%
% Each line is read with the format '%u %s %s %u %u %s %s': region id,
% an ignored string column, chromosome, start position, end position,
% strand and a string listing the mutation locations of the read.
%
% Input:
%   filepath - path of the text file.
%
% Outputs:
%   chr   - nominal, chromosome of each read.
%   reg   - int32, region id of each read.
%   spos  - int32, start location of each read.
%   epos  - int32, end location of each read.
%   mut   - int32 trd x mm matrix of mutation locations, one row per read,
%           padded with zeros on the right.
%   strn  - nominal, strand of each read.
%   rlen  - int8, read length (epos - spos + 1).
%   tfreq - int32 mrlen x 2, number of reads per read length
%           (column 1: '-', column 2: '+').
%   mfreq - 2x1 cell of int32 mrlen x mrlen tables; mfreq{k}(j,p) is the
%           number of mutations at read position p among reads of length j
%           (k = 1: '-', k = 2: '+').
%   trd   - total number of reads.
%   ureg  - unique region ids.
%   treg  - number of unique region ids.

[reg,~,chr,spos,epos,strn,strmut] = textread(filepath,'%u %s %s %u %u %s %s','delimiter','\t');

reg = int32(reg); % region id
chr = nominal(chr); % chromosome
spos = int32(spos); % start locations of reads
epos = int32(epos); % end locations of reads
strn = nominal(strn); % strand
trd = length(reg); % total read number
ureg = unique(reg); % unique region id
treg = length(ureg); % total number of unique region ids

% Read lengths are stored as int8. Converting a longer read would silently
% saturate at 127 and no longer match spos:epos, so fail loudly instead.
rawlen = epos - spos + 1;
if any(rawlen > double(intmax('int8')))
    error('loadreads:readTooLong', ...
        'Read length exceeds %d and cannot be stored as int8.', intmax('int8'));
end
rlen = int8(rawlen);

% parse the mutation strings into numeric row vectors
% NOTE(review): str2num evaluates its input; the file must be trusted.
numut = cell(1,trd);
for j = 1:trd
    numut{j} = str2num(strmut{j});
end

nm = cellfun(@length,numut); % number of mutations per read
mm = max(nm);

% pack the mutation locations into a zero padded trd x mm matrix
mut = zeros(trd,mm);
for j = 1:mm
    hasj = (nm == j);
    mut(hasj,1:j) = reshape([(numut{hasj})],j,sum(hasj))';
end
mut = int32(mut);

% mutation positions relative to the read start; for '-' reads the
% position is counted from the other end of the read. Zero padded entries
% end up outside 1..rlen and are therefore never counted below.
mpos = mut - repmat(spos,1,mm) + 1;
isneg = (strn == '-');
mpos(isneg,:) = repmat(int32(rlen(isneg)),1,mm) - mpos(isneg,:) + 1;

% strand specific
% index 1: negative strand, index 2: positive strand
ustrn = nominal({'-','+'});
mrlen = max(rlen);
tfreq = int32(zeros(mrlen,2));

for i = 1:mrlen
    tfreq(i,:)= [sum(rlen==i & strn == ustrn(1)), sum(rlen==i & strn == ustrn(2))];
end

temp = int32(zeros(mrlen,mrlen));

% frequency of mutations in each read length and position combination
% strand specific
mfreq = cell(2,1);
mfreq{1} = temp; % negative strand
mfreq{2} = temp; % positive strand

for k = 1:2
    for j = 1:mrlen
        rows = (rlen == j & strn == ustrn(k));
        % Bin along dimension 1 explicitly. Without the dimension argument
        % a single matching read (row vector input) is binned along
        % dimension 2, the sum collapses to a scalar and that scalar is
        % written to every position 1..j.
        mfreq{k}(j,1:j) = sum(histc(mpos(rows,:), 1:j, 1),2);
    end
end

end

% function to retrieve variables from read sequences
function crray = dcpread( chr, uchr, reg, spos, epos, strn, ustrn, rlen, mut, zs)
% DCPREAD Decompose one read into per-base column vectors.
%
% Returns the 1x8 cell {dchr, dloc, dmut, dpos, dreg, drlen, dstrn, dzs}
% where every entry has one row per base of the read:
%   dchr  - index of chr within uchr (int8)
%   dloc  - locations spos:epos
%   dmut  - true where the location is listed in mut
%   dpos  - position within the read (1..rlen for ustrn(2), reversed
%           otherwise)
%   dreg  - region id (int32)
%   drlen - read length (int8)
%   dstrn - true when strn equals ustrn(2)
%   dzs   - zs{2,rlen}' for ustrn(2), the flipped zs{1,rlen}' otherwise

isplus = (strn == ustrn(2)); % strn = '+'
col8 = ones(rlen,1,'int8');

% quantities that are constant over the read
dchr = int8(find(uchr == chr,1,'first')) .* col8;
drlen = rlen .* col8;
dreg = reg .* ones(rlen,1,'int32');
dstrn = false(rlen,1);
dstrn(:) = isplus;

% genomic locations covered by the read and the mutated ones among them
dloc = (spos:epos)';
dmut = false(rlen,1);
dmut(ismember(dloc,mut)) = true;

% build position and zero flags in forward orientation ...
k = 1 + isplus; % row of zs: 1 for the other strand, 2 for ustrn(2)
dpos = (1:rlen)';
dzs = cat(1,zs{k,rlen})';

% ... and flip both when the read is not on ustrn(2)
if ~isplus
    dpos = flipud(dpos);
    dzs = flipud(dzs);
end

crray = {dchr, dloc, dmut, dpos, dreg, drlen, dstrn, dzs};

end

% function to compute clustered quantities in each location.
function crray = aggread(chr, reg, loc, slc, cmut, cstrn, strn, ustrn, zs )
% AGGREAD Collapse per-base records into one record per unique slc value.
%
% Inputs reg, loc, slc, strn and zs hold one row per base; cmut and cstrn
% hold one row per read (mutation locations and strand).
%
% Returns the 1x8 cell {chr, loc, mut, read, reg, slc, strn, zs} with one
% row per unique value of slc:
%   chr  - chr repeated for every row
%   loc  - location belonging to the slc value
%   mut  - int16 n x 3; the first column receives the number of times the
%          location occurs among the mutations of the matching strand
%   read - int16 count of base records per slc value
%   reg  - region id belonging to the slc value
%   slc  - the unique slc values
%   strn - logical strand flag belonging to the slc value
%   zs   - int16 count of base records per slc value whose zs flag is set
% Raises an error when the locations of one strand are not sorted.

[uslc, first] = unique(slc);
ureg = reg(first);
uloc = loc(first);
uchr = repmat(chr,length(first),1);
ustrand = logical(strn(first));

nbase = length(slc);
nloc = length(uslc);

% number of base records per unique slc
nread = int16(histc(slc,uslc));

% same count, restricted to the records flagged in each column of zs
zcols = mat2cell(zs,nbase,1);
zcnt = cell(size(zcols));
for c = 1:numel(zcols)
    zcnt{c} = histc(slc(zcols{c}),uslc);
end
zcnt = int16(cat(2,zcnt{:}));

% mutation counts per location, handled strand by strand
% (k = 1 pairs ustrn(1) with flag false, k = 2 pairs ustrn(2) with true)
nmut = zeros(nloc,3,'int16');
for k = 1:2
    onstrand = (ustrand == (k-1));

    hits = nonzeros(cmut(cstrn == ustrn(k),:)); % drop the zero padding
    sites = unique(hits);
    cnt = int16(histc(hits,sites));
    nmut(onstrand & ismember(uloc,sites)) = cnt;

    if ~issorted(uloc(onstrand))
        error('Locations are not sorted.');
    end
end

crray = {uchr, uloc, nmut, nread, ...
    ureg, uslc, ustrand, zcnt};

end


