#!/usr/bin/env perl
#
# cat pmd.dat | pmd.pl
#

use strict;
use warnings;

my $entry = "";
my $head = "";
while (my $l = <STDIN>) {
  if ($l =~ /^(\S+)/) {
    $head = $1;
  }
  if ($l =~ /^ENTRY +(\S+)/) {
    $entry = $1;
  }
  if ($head eq "MEDLINE") {
    unless ($l =~ /[\-\/_]/) {
      if ($l =~ /(\d+)/) {
        print "pmd:$entry\tpubmed:$1\n";
      }
    }
  }
  elsif ($head eq "CROSS-REFERENCE") {
    crossref(substr($l,16)) if length $l > 16;
  }
  elsif ($head eq "PROTEIN") {
    ec(substr($l,16)) if length $l > 16;
  }
  elsif ($head eq "SOURCE") {
    ec(substr($l,16)) if length $l > 16;
  }
  elsif ($l =~ /^ +#EC/) {
    ec($l);
  }
  if ($l =~ /^\/\/\//) {
    $entry = "";
    $head = "";
  }
}

sub crossref {
  my $s = shift;
  if ($s =~ /^#EC/) {
    ec($s);
    return;
  }
  my @r = split /\//,$s;
  foreach my $e (@r) {
    next unless $e =~ /\w/;
    $e =~ s/ +\(/\(/g;
    $e =~ s/ and /\&/;
    $e =~ s/ \+ /\&/;
    $e =~ s/[ ,]/\&/g;
    my @y = split /\&/,$e;
    foreach my $m (@y) {
      if ($m =~ /(\w+)/) {
        my $id = $1;
        if ($m =~ /\(genbank/i) {
          if (isgenbank_locus($id)) {
            print "pmd:$entry\tinsdc:$id\n";
          }
        }
        elsif ($m =~ /\(embl\)/i) {
          if (isembl($id)) {
            print "pmd:$entry\tinsdc:$id\n";
          }
        }
        elsif ($m =~ /\(pdb\)/i) {
          if (ispdb($id)) {
            print "pmd:$entry\tpdb:$id\n";
          }
        }
        elsif ($m =~ /\(prf\)/i) {
          next;
        }
        elsif ($id =~ /_/) {
          if (isuniprot($id)) {
            print "pmd:$entry\tuniprot:$id\n";
          }
          if ($m =~ /\((\w+_\w+)\)/) {
            if ($id ne $1) {
              $id = $1;
              if (isuniprot($id)) {
                print "pmd:$entry\tuniprot:$id\n";
              }
            }
          }
        }
        elsif (ispdb($id)) {
          print "pmd:$entry\tpdb:$id\n";
        }
        elsif ($m =~ /\((\w+_\w+)\)/) {
          $id = $1;
          if (isuniprot($id)) {
            print "pmd:$entry\tuniprot:$id\n";
          }
        }
      }
    }
  }
}

sub ec {
  my $s = shift;
  return if $s =~ /[1-6]\.\d+\.\d+\.\d+\./;
  my @r = ($s =~ /([1-6]\.\d+\.\d+\.\d+)/g);
  foreach my $e (@r) {
    print "pmd:$entry\tec:$e\n";
  }
}

sub isuniprot {
  my $s = shift;
  my $l = length $s;
  return 0 if $l < 5;
  return 0 if $l > 12;
  my @r = split //,$s;
  return 0 unless $r[0] =~ /[0-9A-Z]/;
  for (my $i = 1; $i < $l; $i++){
    return 0 unless $r[$i] =~ /[0-9A-Z_]/;
  }
  my $c = 0;
  for (my $i = 0; $i < $l; $i++){
    $c++ if $r[$i] =~ /[A-Z]/;
  }
  return 0 if $c == $l;
  return 1;
}

sub isgenbank {
  my $s = shift;
  my $l = length $s;
  return 0 if $l < 6;
  return 0 if $l > 8;
  my @r = split //,$s;
  return 0 unless $r[0] =~ /[A-Z]/;
  return 0 unless $r[1] =~ /[0-9A-Z]/;
  for (my $i = 2; $i < $l; $i++){
    return 0 unless $r[$i] =~ /\d/;
  }
  return 1;
}

sub isgenbank_locus {
  my $s = shift;
  my $l = length $s;
  return 0 if $l < 3;
  return 0 if $l > 11;
  my @r = split //,$s;
  for (my $i = 0; $i < $l; $i++){
    return 0 unless $r[$i] =~ /[0-9A-Z]/;
  }
  my $c = 0;
  my $d = 0;
  for (my $i = 0; $i < $l; $i++){
    $c++ if $r[$i] =~ /[A-Z]/;
    $d++ if $r[$i] =~ /\d/;
  }
  return 0 if $c == $l;
  return 0 if $d == $l;
  return 1;
}

sub isembl {
  my $s = shift;
  my $l = length $s;
  return 0 if $l < 6;
  return 0 if $l > 8;
  my @r = split //,$s;
  return 0 unless $r[0] =~ /[A-Z]/;
  return 0 unless $r[1] =~ /[0-9A-Z]/;
  for (my $i = 2; $i < $l; $i++){
    return 0 unless $r[$i] =~ /\d/;
  }
  return 1;
}

sub isembl_locus {
  my $s = shift;
  my $l = length $s;
  return 0 if $l < 1;
  return 0 if $l > 10;
  my @r = split //,$s;
  return 0 unless $r[0] =~ /[A-Z]/;
  for (my $i = 1; $i < $l; $i++){
    return 0 unless $r[$i] =~ /[0-9A-Z]/;
  }
  my $c = 0;
  my $d = 0;
  for (my $i = 0; $i < $l; $i++){
    $c++ if $r[$i] =~ /[A-Z]/;
    $d++ if $r[$i] =~ /\d/;
  }
  return 0 if $c == $l;
  return 0 if $d == $l;
  return 1;
}

sub ispdb {
  my $s = shift;
  my $l = length $s;
  return 0 if $l < 4;
  return 0 if $l > 5;
  my @r = split //,$s;
  return 0 unless $r[0] =~ /\d/;
  for (my $i = 1; $i < $l; $i++ ){
    return 0 unless $r[$i] =~ /[0-9A-Z]/;
  }
  my $c = 0;
  my $d = 0;
  for (my $i = 0; $i < $l; $i++){
    $c++ if $r[$i] =~ /[A-Z]/;
    $d++ if $r[$i] =~ /\d/;
  }
  return 0 if $c == $l;
  return 0 if $d == $l;
  return 1;
}
