use strict;

package InSilicoSpectro::Databanks::DBEntryUniprot;
require Exporter;
use Carp;

=head1 NAME

InSilicoSpectro::Databanks::DBEntryUniprot

=head1 SYNOPSIS


=head1 DESCRIPTION

Inherit from DBEntry, but can parse a uniprot format

=head1 FUNCTIONS

=head3 useInSilicoSpectro()

determine if InSilicoSpectro lib is used in the current environment. The main difference will be the tracnslation from uniprot MOD_RES nomenclature to InSilciSpectro one

=head1 METHODS


=head3 my $dbu=InSilicoSpectro::Databanks::DBEntryUniprot;

=head2 Accessors/Setters

=head3 $dbu->chains()

get a ref to an array of arrays for chain delimiter [from, to]

=head3 $dbu->add_chain([$from, $to])

set chain (through a reference a to an array)

=head3 $dbu->clear_chains()

Reset the chain arrays

=head3 $dbu->signals(); $dbu->add_signal([$from, $to]); $dbu->clear_signals()

Idem than for the *chain subs

=head2 Derived sequence generation

=head3 $dbu->generateIsoforms()

returns an array of InSilicoSpectro::Databanks::DBEntry containing all the isoforms generated by a swissprot entry

=head3 $dbu->generateChains()

Retuns an array of InSilicoSpectro::Databanks::DBEntry containing entries from FT CHAIN lines

=head3 $dbu->generatePeptides()

Retuns an array of InSilicoSpectro::Databanks::DBEntry containing entries from FT PEPTIDE lines

=head3 $dbu->generateDerivedForms()

Retuns an array of InSilicoSpectro::Databanks::DBEntry containing entries from the concatenation of the above methods

=head3 $dbu->seqSubstr(from=>int, to=>int [, subseq=>AAstring]);

=head3 $dbu->seqSubstr(pos=>int, len=>int [, subseq=>AAstring]);

Replace a piece of the sequence by a a subseq (or remove it if sebseq is unfdefined). All annotation will be updated (or remove if they inerfer with the substitued sequence.

from=>int notation starts sequence at position 1;

pos=>int notation starts sequence at position 0;

=head3 $dbu->seqExtract(from=>int, to=>int);

=head3 $dbu->seqExtract(pos=>int, len=>int);

Keep only the sub sequence described par the given delimiters (see seqSubstr(...) for description)

=head2 I/O

=head3 $dbe->readDat($fastacontent);

read info from fasta contents (fitrs line with '>' and info + remaining is sequence.

=head1 EXAMPLES

=head1 EXPORT

=head3 $VERBOSE

verbose level

=head1 SEE ALSO

=head1 COPYRIGHT

Copyright (C) 2004-2005  Geneva Bioinformatics www.genebio.com

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

=head1 AUTHORS

Alexandre Masselot, www.genebio.com

=cut

use InSilicoSpectro::Databanks::DBEntry;
our (@ISA, @EXPORT, @EXPORT_OK);
@ISA = qw(InSilicoSpectro::Databanks::DBEntry);

our $VERBOSE=0;

@EXPORT = qw(&useInSilicoSpectro $VERBOSE);
@EXPORT_OK = ();

use File::Basename;
use Class::Std;

use Data::Dumper;

our $isUsingInSilicoSpectro;

our @attrArrays=qw(chain signal peptide);
our $attrArraysStr=join '|', @attrArrays;
our $attrArraysRE=qr/\b($attrArraysStr)\b/;
our @attr=qw(chain signal);
our $attrStr=join '|', @attr;
our $attrRE=qr/\b($attrStr)\b/;

sub new{
  my ($pkg, $h)=@_;

  my $dvar = $pkg->SUPER::new();

  foreach (@attrArrays){
    $dvar->{$_}=[];
  }

  foreach (keys %{$h}){
    $dvar->{$_}= $h->{$_};
  }


  bless $dvar, $pkg;
  return $dvar;
}




sub AUTOMETHOD{
  my ($self, $ident, $val)=@_;
  my $set=exists $_[2];

  my $name=$_;
  if($name=~/add_($attrArraysStr)/){
    $name=$1;
    return sub {
      croak "must give a value to add to the array [$name]" unless defined $val;
      push @{$self->{$name}}, $val;
    };
    return sub {return $self->{$name}};
  }elsif($name=~/($attrArraysStr)s/){
    $name=$1;
    return sub {$self->{$name}=$val; return $val} if($set);
    return sub {return wantarray?@{$self->{$name}}:$self->{$name}};
  }elsif($name=~/clear_($attrArraysStr)s/){
    $name=$1;
    return sub {$self->{$name}=[]};
  }elsif( $name=~$attrRE){
    return sub {$self->{$name}=$val; return $val} if($set);
    return sub {return $self->{$name}};
  }else{
    return undef;
  }
}

################ Functions

sub useInSilicoSpectro{
  return $isUsingInSilicoSpectro if defined $isUsingInSilicoSpectro;
  eval{
    require InSilicoSpectro;
    InSilicoSpectro::init();
    $isUsingInSilicoSpectro=1;
  };
  if($@){
    warn "will not use InSilicoSpectro module & definitions";
    warn "$@";
    $isUsingInSilicoSpectro=0;
  }
  return $isUsingInSilicoSpectro;
}

# I/O
sub readDat{
  my $self=shift;
  my $dat=shift;
  my ($seq, $recSeq, $curFTLine, $recFT);

  $self->clear_chains();
  $self->clear_signals();
  $self->clear_annotatedModRes();
  $self->clear_variants();
  $self->{FTLines}={};
  my $acRead;
  my $descr;
  foreach(split /\n/, $dat){
    last if /^\/\//;
    undef $curFTLine if substr($_, 3, 10)=~/\S/;

    if ($recSeq){
      $seq.=$_;
      next;
    }
    if($curFTLine){
      croak "recording FT line is on and line does not match /^FT\\s+/" unless s/^FT\s+//;
      $curFTLine->{comment}.=" $_";
      next;
    }

    if(/^ID\s+(\w+)/){
      my $v=$1;
      $self->ID($v);
    }elsif(/^AC\s+(\w+)/ && ! $acRead){
      my $v=$1;
      $self->AC($v);
      $acRead=1;
    }elsif(/^DE\s+(.*)/){
      $descr.=" " if $descr;
      $descr.=$1;
    }elsif(/^OX\s+NCBI_TaxID=(\d+);/){
      my $v=$1;
      $self->ncbiTaxid($v);
    }elsif(/^FT\s+CHAIN\s+(\d+)\s+(\d+)/){
      my($to, $from)=($1, $2);
      $self->add_chain([$to, $from]);
    }elsif(/^FT\s+SIGNAL\s+(\d+)\s+(\d+)/){
      my($to, $from)=($1, $2);
      $self->add_signal([$to, $from]);
    }elsif(/^FT\s+PEPTIDE\s+(\d+)\s+(\d+)/){
      my($to, $from)=($1, $2);
      $self->add_peptide([$to, $from]);
    }elsif(/^FT\s+MOD_RES\s+(\d+)\s+(\d+)\s+(.*)/){
      if($2!=$1){
	carp "cannot handle multi-position FT MOD_RES: $_";
	next;
      }
      my ($p, $str)=($1, $3);
      if(useInSilicoSpectro){
	my $mr=InSilicoSpectro::InSilico::ModRes::getModifFromSprotFT($str);
	unless ($mr){
	  carp "cannot retrieve mod res from annotation [$str]" if $VERBOSE>=1;
	  next;
	}
	$self->add_annotatedModRes($p, $mr->get('name'));
      }else{
	$self->add_annotatedModRes($p, $str);
      }
    }elsif(/^FT\s+(VAR_SEQ|VARIANT)\s+(\d+)\s+(\d+)\s+(.*)/){
      my ($ft, $p1, $p2, $com)=($1, $2, $3, $4);
      $curFTLine={
		  from=>$2,
		  to=>$3,
		  comment=>$4
		 };
      push @{$self->{FTLines}{$1}}, $curFTLine;
    }elsif(/^SQ/){
      $recSeq=1;
    }
  }
  $self->sequence($seq);
  $self->description($descr);

  #rescan $self->{FTLines}
  #to put back $self->{seqSubstr} info together with isoform labels
  delete $self->{VAR_SEQ};
  $self->{seqSubstr}=[];
  if($self->{FTLines}{VAR_SEQ}){
    foreach my $ftl (@{$self->{FTLines}{VAR_SEQ}}){
      my @isof= $ftl->{comment}=~/(?<=isoform)\s+(\w+)/gi;
      carp "cannot parse VAR_SPLIC labels from [$ftl->{comment}] for ".$self->AC unless @isof;
      my $substr;
      if($ftl->{comment}=~/Missing/i){
	
      }elsif($ftl->{comment}=~/\w+\s*\->\s*([A-Z ]+)/){
	$substr=$1;
	$substr=~s/\s+//g;
      }else{
	carp "unparsable for Missing or subst FT VAR_SEQ comment: [$ftl->{comment}] for ".$self->AC;
      }
      my $h={
	     from=> $ftl->{from},
	     to=>$ftl->{to},
	     substr=>$substr,
	    };
      my $idx=scalar @{$self->{seqSubstr}};
      foreach(@isof){
	push @{$self->{VAR_SEQ}{$_}{seqSubstrIndex}}, $idx;
      }
      push @{$self->{seqSubstr}}, $h;
    }
    #reorder sub annotation to be in position decreasing order (to keep coherence)
    foreach (values %{$self->{VAR_SEQ}}){
      my @tmp=@{$_->{seqSubstrIndex}};
      @tmp=sort {$self->{seqSubstr}[$b]{from} <=> $self->{seqSubstr}[$a]{from}} @tmp;
      $_->{seqSubstrIndex}=\@tmp;
    }
    #@{$self->{seqSubstr}}=sort {$b->{from} <=> $a->{from}} @{$self->{seqSubstr}};
  }

  if($self->{FTLines}{VARIANT}){
    foreach my $ftl (@{$self->{FTLines}{VARIANT}}){
      if($ftl->{comment}=~/([A-Z]+)\s+\->\s+([\*A-Z]+)/){
	$self->add_variant($ftl->{from}, $1, $2);
      }elsif($ftl->{comment}=~/Missing/i){
	my $tmp='';
	foreach ($ftl->{from}..$ftl->{to}){
	  $tmp.='.';
	}
	$self->add_variant($ftl->{from}, $tmp, '');
      }else{
	carp "cannot parse VARIANT info from [$ftl->{comment}] for ".$self->AC;
      }
    }
  }

  undef $self->{FTLines};
}


################### Derived sequence generation ################3

sub generateDerivedForms{
  my $self=shift;

  my @ret;
  my @tmp=$self->generateChains();
  push @ret, @tmp;
  @tmp=$self->generateIsoforms();
  push @ret, @tmp;
  @tmp=$self->generatePeptides();
  push @ret, @tmp;
  return @ret;
}

sub generateIsoforms{
  my $self=shift;
  my @isoforms;

  foreach my $isoform (sort keys %{$self->{VAR_SEQ}}){
    my $isoseq=InSilicoSpectro::Databanks::DBEntryUniprot->new($self);
    $isoseq->AC($self->AC()."-$isoform");
    $isoseq->ACorig($self->AC());
    $isoseq->description($self->description()." [ISOFORM $isoform])");


    delete $isoseq->{VAR_SEQ};
    #duplicate seqSubstr not to alter the original sequence
    delete $isoseq->{seqSubstr};
    if($self->{seqSubstr}){
      foreach(@{$self->{seqSubstr}}){
	my %h=%$_;
	push @{$isoseq->{seqSubstr}}, \%h;
      }
    }

#    #remove seq outside the chain
#    if($isoseq->chain){
#      if($isoseq->chain()->[0]>1){
#	$isoseq->seqSubstr(from=>1, to=>$isoseq->chain()->[0]-1);
#      }
#    }
    foreach my $idx(@{$self->{VAR_SEQ}{$isoform}{seqSubstrIndex}}){
      my %h=%{$isoseq->{seqSubstr}[$idx]};
      $isoseq->seqSubstr(from=> $h{from}, to=>$h{to}, substr=>$h{substr});
    }

    #bless up to InSilicoSpectro::Databanks::DBEntry
    bless $isoseq, "InSilicoSpectro::Databanks::DBEntry";
    push @isoforms, $isoseq;
  }
  return @isoforms;
}

sub generatePeptides{
  my $self=shift;
  my @peptides;
  my $i=0;
  foreach my $pp (@{$self->peptides()}){
    my $pseq=InSilicoSpectro::Databanks::DBEntryUniprot->new($self);
    $pseq->description($self->description()." [PEPTIDE $i])");
    $pseq->ACorig($self->AC());
    $pseq->AC($self->AC()."_PEPT_$i");
    $pseq->seqExtract(from=>$pp->[0], to=>$pp->[1]);
    $i++;
    push @peptides, $pseq;
  }
  return @peptides;
}


sub generateChains{
  my $self=shift;
  my @chains;
  my $i=0;
  foreach my $c (@{$self->chains()}){
    my $cseq=InSilicoSpectro::Databanks::DBEntryUniprot->new($self);
    $cseq->ACorig($self->AC());
    $cseq->AC($self->AC()."_CHAIN_$i");
    $cseq->description($self->description()." [CHAIN $i])");
    $cseq->seqExtract(from=>$c->[0], to=>$c->[1]);
    $i++;
    push @chains, $cseq;
  }
  return @chains;
}


sub seqSubstr{
  my $self=shift;
  my %hprm=@_;
  my ($pos, $len, $substr);
  if($hprm{from} && $hprm{to}){
    $pos=$hprm{from}-1;
    $len=$hprm{to}-$hprm{from}+1;
  }elsif((defined $hprm{pos}) && $hprm{len}){
    ($pos, $len)=($hprm{pos}, $hprm{len});
  }else{
    croak "cannot DBEntryUniprot::seqSubstr with paramer [@_] (either (from=>x, to=>y) or (pos=>x, len=>y) for ".$self->AC;
  }
  $substr=$hprm{substr};
  my $seq=$self->sequence;
  $pos|=0;
  $seq=~s/(.{$pos}).{$len}/$1$substr/;
  $self->sequence($seq);
  $self->updateAnnotPos($pos, $len);
}


sub seqExtract{
  my $self=shift;
  my %hprm=@_;
  my ($pos, $len, $substr);
  if($hprm{from} && $hprm{to}){
    $pos=$hprm{from}-1;
    $len=$hprm{to}-$hprm{from}+1;
  }elsif((defined $hprm{pos}) && $hprm{len}){
    ($pos, $len)=($hprm{pos}, $hprm{len});
  }else{
    croak "cannot DBEntryUniprot::seqSubstr with paramer [@_] (either (from=>x, to=>y) or (pos=>x, len=>y) for ".$self->AC;
  }
  my $lseq=length $self->sequence();
  if($lseq>($pos+$len)){
    $self->seqSubstr(from=>$pos+$len+1, to=>$lseq);
  }
  if($pos>0){
    $self->seqSubstr(from=>1, to =>$pos);
  }
}

sub updateAnnotPos{
  my ($self, $pos, $len)=@_;
  #shift or remove all annotated PTM
  my @amr=$self->annotatedModRes;
  if(@amr){
    $self->clear_annotatedModRes;
    foreach (@amr){
      my @tmp=@$_;
      my $p=$tmp[0];
      if($p>=$pos){
	$tmp[0]-=$len;
	$self->add_annotatedModRes(@tmp) if $tmp[0]>0;
      }else{
	$self->add_annotatedModRes(@tmp);
      }
    }
  }
  @amr=$self->variants;
  if(@amr){
    $self->clear_variants;
    foreach (@amr){
      my @tmp=@$_;
      my $p=$tmp[0];
      if($p>=$pos){
	$tmp[0]-=$len;
	$self->add_variant(@tmp) if $tmp[0]>0;
      }else{
	$self->add_variant(@tmp);
      }
    }
  }
}


return 1;
