#!/usr/bin/env perl
#
# cat epd.blk | epd.pl
#

use strict;
use warnings;

# Accession taken from the most recent "AC" line of the record being read.
# Kept at file level because dr() reads it when printing links.
my $entry = "";
# First whitespace-delimited token of the most recent line that starts with a
# non-blank character; lines starting with whitespace leave it unchanged.
my $head = "";

# Walk the input line by line, printing one link per recognised line.
while (defined(my $line = <STDIN>)) {
  $head = $1 if $line =~ /^(\S+)/;

  if ($line =~ /^AC +(EP\d{5}+)/) {
    $entry = $1;
  }
  elsif ($line =~ /^RX +MEDLINE\; +(\d+)/) {
    # A MEDLINE number of zero is not printed.
    print "epd:$entry\tpubmed:$1\n" if $1 > 0;
  }
  elsif ($head eq "DR") {
    dr($line);
  }

  # A line starting with "//" resets the per-record state.
  next unless $line =~ /^\/\//;
  $entry = "";
  $head = "";
}

# Handle one "DR" line: extract the database name and the identifier that
# follows it, and print "epd:<entry>\t<prefix>:<id>" for the databases handled
# below.  The accession on the left-hand side is read from the file-level
# $entry.
#
#   $s - the raw input line
#
# Nothing is printed for lines that do not look like "DR   <db>; <id>", for
# databases that are not handled, or for identifiers that fail the
# per-database format check.
sub dr {
  my $s = shift;

  # A failed match leaves $1/$2 holding whatever an earlier match captured
  # (or undef), so stop here instead of working with stale values.
  return unless $s =~ /^DR +(\S+)\; ([^\;\.\/]+)/;
  my $db = $1;
  my $id = $2;

  if ($db eq "EMBL") {
    if (isembl($id)) {
      print "epd:$entry\tinsdc:$id\n";
    }
  }
  #elsif ($db eq "ENSEMBL") {
  #  print "epd:$entry\tensembl:$id\n";
  #}
  elsif ($db eq "EPD") {
    # Only well-formed accessions other than the current entry are linked.
    if ($id =~ /^EP\d{5}$/ && $id ne $entry) {
      print "epd:$entry\tepd:$id\n";
    }
  }
  elsif ($db eq "FLYBASE") {
    print "epd:$entry\tflybase:$id\n";
  }
  #elsif ($db eq "GENOME") {
  #  print "epd:$entry\tncbi-???:$id\n";
  #}
  elsif ($db eq "MGD") {
    # Only the first run of digits is emitted.  Without this check an
    # identifier containing no digits would print the stale $1 left over
    # from the DR match above (the database name).
    if ($id =~ /(\d+)/) {
      print "epd:$entry\tmgd:$1\n";
    }
  }
  elsif ($db eq "MIM") {
    print "epd:$entry\tomim:$id\n";
  }
  elsif ($db eq "RefSeq") {
    if (isrefseq_ac($id)) {
      print "epd:$entry\trefseq:$id\n";
    }
  }
  elsif ($db eq "SPTREMBL" || $db eq "SWISS-PROT") {
    # Both database names are printed under the same "uniprot" prefix.
    if (isuniprot($id)) {
      print "epd:$entry\tuniprot:$id\n";
    }
  }
  return;
}

# Return 1 if the string has the shape accepted here for a UniProt
# identifier, 0 otherwise:
#   - 5 to 12 characters long,
#   - first character an upper-case letter or a digit,
#   - remaining characters upper-case letters, digits or underscores,
#   - not made up entirely of upper-case letters.
sub isuniprot {
  my ($candidate) = @_;
  my $len = length $candidate;
  return 0 if $len < 5 || $len > 12;

  my @chars = split //, $candidate;
  return 0 unless $chars[0] =~ /[0-9A-Z]/;
  foreach my $ch (@chars[1 .. $#chars]) {
    return 0 unless $ch =~ /[0-9A-Z_]/;
  }

  # grep in scalar context counts the upper-case letters.
  my $letters = grep { /[A-Z]/ } @chars;
  return $letters == $len ? 0 : 1;
}

# Return 1 if the string has the shape accepted here for a GenBank
# accession, 0 otherwise:
#   - 6 to 8 characters long,
#   - first character an upper-case letter,
#   - second character an upper-case letter or a digit,
#   - every remaining character a digit.
sub isgenbank {
  my ($candidate) = @_;
  my $len = length $candidate;
  return 0 if $len < 6 || $len > 8;

  my ($lead, $second, @tail) = split //, $candidate;
  return 0 unless $lead =~ /[A-Z]/;
  return 0 unless $second =~ /[0-9A-Z]/;

  # Any non-digit in the tail disqualifies the string.
  return 0 if grep { $_ !~ /\d/ } @tail;
  return 1;
}

# Return 1 if the string has the shape accepted here for a GenBank locus
# name, 0 otherwise:
#   - 3 to 11 characters long,
#   - only upper-case letters and digits,
#   - a mix of both (neither all letters nor all digits).
sub isgenbank_locus {
  my ($candidate) = @_;
  my $len = length $candidate;
  return 0 if $len < 3 || $len > 11;

  my @chars = split //, $candidate;
  return 0 if grep { $_ !~ /[0-9A-Z]/ } @chars;

  # grep in scalar context yields the number of matching characters.
  my $letters = grep { /[A-Z]/ } @chars;
  my $digits  = grep { /\d/ } @chars;
  return 0 if $letters == $len;
  return 0 if $digits == $len;
  return 1;
}

# Return 1 if the string has the shape accepted here for an EMBL accession,
# 0 otherwise:
#   - 6 to 8 characters long,
#   - first character an upper-case letter,
#   - second character an upper-case letter or a digit,
#   - every remaining character a digit.
sub isembl {
  my ($acc) = @_;
  my $len = length $acc;
  return 0 if $len < 6 || $len > 8;

  my @chars = split //, $acc;
  my $prefix_ok = $chars[0] =~ /[A-Z]/ && $chars[1] =~ /[0-9A-Z]/;
  return 0 unless $prefix_ok;

  foreach my $pos (2 .. $#chars) {
    return 0 unless $chars[$pos] =~ /\d/;
  }
  return 1;
}

# Return 1 if the string has the shape accepted here for an EMBL locus name,
# 0 otherwise:
#   - 1 to 10 characters long,
#   - first character an upper-case letter,
#   - remaining characters upper-case letters or digits,
#   - neither all letters nor all digits.
sub isembl_locus {
  my ($candidate) = @_;
  my $len = length $candidate;
  return 0 if $len < 1 || $len > 10;

  my ($lead, @rest) = split //, $candidate;
  return 0 unless $lead =~ /[A-Z]/;
  return 0 if grep { $_ !~ /[0-9A-Z]/ } @rest;

  # Count letters and digits over the whole string, first character included.
  my $letters = grep { /[A-Z]/ } $lead, @rest;
  my $digits  = grep { /\d/ } $lead, @rest;
  return ($letters == $len || $digits == $len) ? 0 : 1;
}

# Return 1 if the string has the shape accepted here for a RefSeq accession,
# 0 otherwise:
#   - exactly 9 characters long,
#   - two upper-case letters,
#   - then one upper-case letter, digit or underscore,
#   - then digits only.
sub isrefseq_ac {
  my ($acc) = @_;
  my $len = length $acc;
  return 0 unless $len == 9;

  my ($first, $second, $third, @digits) = split //, $acc;
  foreach my $letter ($first, $second) {
    return 0 unless $letter =~ /[A-Z]/;
  }
  return 0 unless $third =~ /[0-9A-Z_]/;
  return 0 if grep { $_ !~ /\d/ } @digits;
  return 1;
}

# Return 1 if the string has the shape accepted here for a PDB identifier,
# 0 otherwise:
#   - 4 or 5 characters long,
#   - first character a digit,
#   - remaining characters upper-case letters or digits,
#   - neither all letters nor all digits.
sub ispdb {
  my ($code) = @_;
  my $len = length $code;
  return 0 if $len < 4 || $len > 5;

  my @chars = split //, $code;
  return 0 unless $chars[0] =~ /\d/;
  foreach my $ch (@chars[1 .. $#chars]) {
    return 0 unless $ch =~ /[0-9A-Z]/;
  }

  # Tally letters and digits in one pass over the characters.
  my ($letters, $digits) = (0, 0);
  foreach my $ch (@chars) {
    $letters++ if $ch =~ /[A-Z]/;
    $digits++ if $ch =~ /\d/;
  }
  return 0 if $letters == $len;
  return 0 if $digits == $len;
  return 1;
}
