#!/usr/bin/env perl
#
# cat pfam | pfam.pl
#

use strict;
use warnings;

my $entry = "";
while (my $l = <STDIN>) {
  if ($l =~ /^#=GF AC +(\S+)/) {
    $entry = $1;
    $entry =~ s/\.\S+$//;
  }
  elsif ($l =~ /^#=GF RM +(\S+)/) {
    pmid($1);
  }
  elsif ($l =~ /^#=GF DR +([^\;]+)\; +([^\;]+)/) {
    dr($1,$2);
  }
  elsif ($l =~ /^#=GF CC +(\S+)/) {
    my $s = substr($l,9);
    while (my $t = <STDIN>) {
      if ($t =~ /^#=GF CC +/) {
        $s .= substr($t,9);
      }
      else {
        $l = $t;
        last;
      }
    }
    cc($s);
  }
  if ($l =~ /^#=GS /) {
    my $s = substr($l,5);
    gs($s);
  }
  if ($l =~ /^\/\//) {
    $entry = "";
  }
}

sub pmid {
  my $pmid = shift;
  print "pfam:$entry\tpubmed:$pmid\n";
}

sub dr {
  my ($db,$id) = @_;
  $db = lc $db;
  $id = $2;
  if ($db eq 'prosite') {
    print "pfam:$entry\tprosdoc:$id\n";
  }
  elsif ($db eq 'prosite_profile') {
    print "pfam:$entry\tprosite:$id\n";
  }
  elsif ($db eq 'mim') {
    print "pfam:$entry\tomim:$id\n";
  }
  elsif ($db eq 'cazy') {
    print "pfam:$entry\t${db}:$id\n";
  }
  elsif ($db eq 'tc') {
    print "pfam:$entry\ttcdb:$id\n";
  }
  elsif ($db eq 'interpro' || $db eq 'scop' || $db eq 'pdb') {
    print "pfam:$entry\t${db}:$id\n";
  }
}

sub cc {
  my $s = shift;
  $s =~ s/\s+/ /gm;

  my @n = ($s =~ /\d+\.\d+\.\d+\.\d+/g);
  foreach my $e (@n) {
    print "pfam:$entry\tec:$e\n";
  }
  @n = ($s =~ /PF\d{5}/g);
  foreach my $e (@n) {
    print "pfam:$entry\tpfam:$e\n";
  }
  @n = ($s =~ /Swiss:(\S+)/g);
  foreach my $e (@n) {
    $e =~ s/[\.\,\)\:]//g;
    if (isuniprot($e)) {
      print "pfam:$entry\tuniprot:$e\n";
    }
  }
  @n = ($s =~ /Swiss\-?Prot:(\S+)/g);
  foreach my $e (@n) {
    $e =~ s/[\.\,\)\:]//g;
    if (isuniprot($e)) {
      print "pfam:$entry\tuniprot:$e\n";
    }
  }
  @n = ($s =~ /UniProt:(\S+)/g);
  foreach my $e (@n) {
    $e =~ s/[\.\,\)\:]//g;
    if (isuniprot($e)) {
      print "pfam:$entry\tuniprot:$e\n";
    }
  }
  @n = ($s =~ /UniProtKB:(\S+)/g);
  foreach my $e (@n) {
    $e =~ s/[\.\,\)\:]//g;
    if (isuniprot($e)) {
      print "pfam:$entry\tuniprot:$e\n";
    }
  }
  @n = ($s =~ /PS\d{5}/g);
  foreach my $e (@n) {
    print "pfam:$entry\tprosite:$e\n";
  }
  @n = ($s =~ /PDOC\d{5}/g);
  foreach my $e (@n) {
    print "pfam:$entry\tprosdoc:$e\n";
  }
  @n = ($s =~ /MIM:(\d+)/g);
  foreach my $e (@n) {
    print "pfam:$entry\tomim:$e\n";
  }
  @n = ($s =~ /Genbank:(\S+)/g);
  foreach my $e (@n) {
    $e =~ s/[\.\,\)\:]//g;
    if (isgenbank($e)) {
      print "pfam:$entry\tgenbank:$e\n";
    }
  }
  @n = ($s =~ /PDB:(\S+)/ig);
  foreach my $e (@n) {
    $e =~ s/[\.\,\)\:]//g;
    if (ispdb($e)) {
      print "pfam:$entry\tpdb:$e\n";
    }
  }
}

sub gs {
  my $s = shift;
  if ($s =~ /AC ([0-9A-Z]+)/) {
    my $id = $1;
    if (isuniprot($id)) {
      print "pfam:$entry\tuniprot:$id\n";
    }
  }
  if ($s =~ /DR PDB; +(\S+)/) {
    my $id = $1;
    if (ispdb($id)) {
      print "pfam:$entry\tpdb:$id\n";
    }
  }
}

sub isuniprot {
  my $s = shift;
  my $l = length $s;
  return 0 if $l < 5;
  return 0 if $l > 12;
  my @r = split //,$s;
  for (my $i = 0; $i < $l; $i++){
    return 0 unless $r[$i] =~ /[0-9A-Z]/;
  }
  my $c = 0;
  for (my $i = 0; $i < $l; $i++){
    $c++ if $r[$i] =~ /[A-Z]/;
  }
  return 0 if $c == $l;
  my $n = 0;
  for (my $i = 0; $i < $l; $i++){
    $n++ if $r[$i] =~ /\d/;
  }
  return 0 if $n == $l;
  return 1;
}

sub isgenbank {
  my $s = shift;
  my $l = length $s;
  return 0 if $l < 6;
  return 0 if $l > 8;
  my @r = split //,$s;
  return 0 unless $r[0] =~ /[A-Z]/;
  return 0 unless $r[1] =~ /[0-9A-Z]/;
  for (my $i = 2; $i < $l; $i++){
    return 0 unless $r[$i] =~ /\d/;
  }
  return 1;
}

sub isembl {
  my $s = shift;
  my $l = length $s;
  return 0 if $l < 6;
  return 0 if $l > 8;
  my @r = split //,$s;
  return 0 unless $r[0] =~ /[A-Z]/;
  return 0 unless $r[1] =~ /[0-9A-Z]/;
  for (my $i = 2; $i < $l; $i++){
    return 0 unless $r[$i] =~ /\d/;
  }
  return 1;
}

sub ispdb {
  my $s = shift;
  my $l = length $s;
  return 0 if $l < 4;
  return 0 if $l > 5;
  my @r = split //,$s;
  return 0 unless $r[0] =~ /\d/;
  for (my $i = 1; $i < $l; $i++ ){
    return 0 unless $r[$i] =~ /[0-9A-Z]/;
  }
  my $c = 0;
  my $d = 0;
  for (my $i = 0; $i < $l; $i++){
    $c++ if $r[$i] =~ /[A-Z]/;
    $d++ if $r[$i] =~ /\d/;
  }
  return 0 if $c == $l;
  return 0 if $d == $l;
  return 1;
}
