[bionet.software] Mailfasta

deboer@balaena.bio.vu.nl (Thon de Boer) (07/17/90)
I wrote a program which uses the services offered by GenBank and EMBL for 
doing FASTA searches. It will send a sequence to a mail-server which will do a
sequence similarity search against the specified databases using the FASTA
program written by William Pearson and David Lipman.
The program runs interactive and is easy to use.
It can read files in the Pearson format, DNA strider files or plain DNA and
PROTEIN files with no comment lines.
For more information about the FASTA mail servers you can contact GenBank 
or EMBL. (send "HELP" in the subject field of a mail message to
SEARCH@GENBANK.BIO.NET or FASTA@EMBL (bitnet/EARN etc))
or FASTA@EMBL (bitnet/EARN etc))

Hope you can use it

-------------------------------------------------------------------------------
 _______                 ___           Thon de Boer
    /                   /__/           Dept. of Microbiological physiology
   //   __ __  _   __  /  \ __ __ __   Biological Faculty
  //-- / // / / \ /_/ /   // //_//  '  Vrije Universiteit, Amsterdam, Holland
 //  //_// / /__//_  /___//_//_ /      deboer@bio.vu.nl
-------------------------------------------------------------------------------
#! /bin/sh
# This is a shell archive, meaning:
# 1. Remove everything above the #! /bin/sh line.
# 2. Save the resulting text in a file.
# 3. Execute the file with /bin/sh (not csh) to create:
#       mailfasta
#       cid
#       getentry
#       mailfasta.doc
#    in directory mf

cat << \SHAR_EOF > 'mailfasta'
#/bin/csh
echo ' '
echo ' '
echo '                                 MailFasta'
echo '                                 *********'
echo By Thon de Boer
echo Department of Microbiological Fysiology
echo Vrije universiteit AMSTERDAM  Holland.
echo This program is in the Public Domain
echo 'Send comments to deboer@bio.vu.nl (email).'
echo ' '
echo Use \'mailfasta -q\' to get the queue from the fasta mail server
echo " or 'mailfasta [sequencefile1 sequencefile2 ..]'"
echo ' '
echo This program will read a sequence file and mail it to
echo 'GenBank (or EMBL) were it will be scanned against the sequence databases'
echo 'using the FASTA program'
echo ' '
echo ' '
set stop = false
while ($stop == false)
if ($1 == '') then
  set argument = empty
else
  set argument = $1
endif
switch ($argument)
case -q:
        echo QUEUE | mail SEARCH@GENBANK.BIO.NET
        echo Queue command send
        exit
        breaksw
case -*:
        echo mailfasta: Unknow flag \'$argument\'
        echo Use \'mailfasta -q\' to get the queue from the fasta mail server
        echo " or 'mailfasta [sequencefile1 sequencefile2 ..]'"
        exit
        breaksw
endsw
if ($argument == empty) then
set good=false
while ($good == false)
echo Enter the filename wich contains the sequence
set file=$<
if ($file != '') then
  if !(-f $file) then
    echo mailfasta: File \'$file\' does not exist
  else
    echo ' '
    more $file
    echo ' '
    echo 'Is this the right file ? ([yes]/no)'
    set choice = $<
    switch ($choice)
    case n*:
         breaksw
    default:
         set good=true
         breaksw
    endsw
  endif
endif
end
else if !(-f $argument) then
  echo mailfasta: File \'$argument\' does not exist
  exit
else
  set file = $argument
endif
if {(cid < $file)} then
      set type = DNA
    else
      set type = PROTEIN
    endif
echo ' '
echo Now using $type file: $file
echo '*********'
grep '>' $file
grep ';' $file
echo ' '
endif
set db = (genbank/all genbank/new genpept/all genpept/new embl/all embl/new swiss-prot/all nbrf genbank/primate genbank/rodent genbank/other_mammalian genbank/other_vertebrate genbank/invertebrate genbank/plant genbank/organelle genbank/bacterial genbank/structural_rna genbank/viral genbank/phage genbank/synthetic genbank/unannotated)
echo 'Which database(s) do you want to search ?'
echo ' '
if ($type == DNA) echo '    1   All GenBank sequences (including new seq since latest release)'
if ($type == DNA) echo '    2   The new GenBank entries'
if ($type == PROTEIN) echo '    3   All translated protein reading frames from GenBank'
if ($type == PROTEIN) echo '    4   The new entries of translated protein reading frames'
if ($type == DNA) echo '    5   All EMBL sequences (including the new sequences)'
if ($type == DNA) echo '    6   The new EMBL entries'
if ($type == PROTEIN) echo '    7   All SWISS-PROT protein entries'
if ($type == PROTEIN) echo '    8   All NBRF/PIR protien entries (results return very slow) '
echo ' '
if ($type == DNA) then
echo ' GenBank subdivisions (for faster searching)'
echo ' '
echo '    9   The primate sequences            16  The bacterial sequences'
echo '    10  The rodent sequences             17  The structural RNA sequences'
echo '    11  The other mammalian sequences    18  The viral sequences'
echo '    12  The other vertebrate seq         19  The phage sequences'
echo '    13  The invertebrate sequences       20  The synthetic sequences'
echo '    14  The plant sequences              21  The unannotated sequences'
echo '    15  The organelle sequences'
echo ' '
endif
set good=false
while ($good == false)
  echo 'Enter the number(s) of your choice (seperated by a <SPACE>)'
  set choice=$<
  if ("$choice" != "") set good = true
  foreach i ($choice)
  if ($type == DNA) then
    switch ($i)
    case 1:
    case 2:
    case 5:
    case 6
    case 9:
    case 10:
    case 11:
    case 12:
    case 13:
    case 14:
    case 15:
    case 16:
    case 17:
    case 18:
    case 19:
    case 20:
    case 21:
         breaksw
    default:
         echo mailfasta: Invallid choice \'$i\'
         set good = false
         breaksw
    endsw
  else
   switch ($i)
   case 3:
   case 4:
   case 7:
   case 8:
        breaksw
   default:
         echo mailfasta: Invallid choice \'$i\'
         set good = false
         breaksw
   endsw
  endif
end
end
set good=false
while ($good == false)
  echo ' '
  echo 'Enter the sensitivity (lower number means more sensitive)'
  if ($type == DNA) then
    echo '  (3..6) [4]'
  else
    echo '  (1..2) [1]'
  endif
  set ktup=$<
  if ($ktup != '') then
    if ($type == DNA) then
      switch ($ktup)
      case [3-6]:
         set good=true
         breaksw
      default:
         echo mailfasta: Invalid choice \'$ktup\'
         breaksw
    endsw
    else
      switch ($ktup)
      case [1-2]:
         set good=true
         breaksw
      default:
         echo mailfasta: Invalid choice \'$ktup\'
         breaksw
      endsw
    endif
else
    set good=true
  endif
end
echo ' '
echo 'Enter the maximum number of matched sequences [100]'
set scores=$<
echo ' '
echo 'Enter the maximum number of alignments [20]'
set align=$<
grep ">" $file > /tmp/mf2$$
set grp = `cat /tmp/mf2$$`
foreach i ($choice)
  if ($i != 8) then
   echo DATALIB $db[$i] >> /tmp/mf$$
   if ($ktup != '') echo KTUP $ktup >> /tmp/mf$$
   if ($scores != '') echo SCORES $scores >> /tmp/mf$$
   if ($align != '') echo ALIGNMENTS $align >> /tmp/mf$$
   echo BEGIN >> /tmp/mf$$
   if (-z /tmp/mf2$$) echo ">"$file $type file >> /tmp/mf$$
   grep -v \; $file >> /tmp/mf$$
   mail SEARCH@GENBANK.BIO.NET < /tmp/mf$$
  else
   echo LIB nbrf >> /tmp/mf$$
   if ($ktup != '') echo WORD $ktup >> /tmp/mf$$
   if ($scores != '') echo LIST $scores >> /tmp/mf$$
   if ($align != '') echo ALIGN $align >> /tmp/mf$$
   if !(-z /tmp/mf2$$) then
    echo TITLE $grp >> /tmp/mf$$
    echo SEQ >> /tmp/mf$$
    grep -v ">" $file >> /tmp/mf$$
   else
    echo TITLE $file $type file >> /tmp/mf$$
    echo SEQ >> /tmp/mf$$
    grep -v \; $file >> /tmp/mf$$
   endif
  mail FASTA@EMBL.BITNET < /tmp/mf$$
  endif
rm /tmp/mf$$
end
if ($2 == '') then
  set stop = true
else
  shift
endif
end

SHAR_EOF
cat << \SHAR_EOF > 'getentry'
#/bin/csh
if ($1 != '') then
  foreach i ($*)
    echo $i | mail retrieve@genbank.bio.net
    end
else
  echo getentry: Retrieve a sequence database entry via mail from GenBank
  echo 'Usage: getentry  entry [entry entry ....]'
  echo ' entry can be an accession number or a locus name'
endif
SHAR_EOF

cat << \SHAR_EOF > 'mailfasta.doc'

MAILFASTA and GETENTRY

NAME
     mailfasta - send a sequence file for comparison with sequence databases

     getentry - get an entry from the sequence databases

SYNOPSIS
     mailfasta -q
     mailfasta [sequencefile1 sequencefile2 ...]

     getentry entry1 [entry2 ...]

DESCRIPTION
   
     MAILFASTA
    
     The GenBank site (genbank.bio.net) and the EMBL site (embl.bitnet)
     contain several sequence databases. A sequence can be compared with these
     databases by sending a specially formatted email message to the sites
     mail-server. On the sites the sequence is compared to the databases with
     the FASTA program written by Bill Pearson. The FASTA program also finds
     related sequences.

     Mailfasta is a interactive program which reads a sequence file (either
     DNA or (one letter coded) protein) and asks which databases is to be
     searched. Then it asks the sensitivity of the search and the maximum
     number of best-ranked results. It will send the sequence to the apropriate
     site and the results of the search will be e-mailed back.
     The sequence file must be an ascii file and the first line can be a comment
     line by starting it with '>'. The program will also read and correctly
     handle sequencefiles made by the Macintosh program 'DNA strider 1.0'. a
     program written by Christian Marck. The sequence does not have to be
     continuous, spaces are allowed, but no other characters, like numbers.

     The list of waiting jobs at the GenBank site can be obtained by using
     the option -q.

     GETENTRY
    
     Entries of interest can be obtained with the program getentry.
     entry can be either the locus name or the accession number.
     The entries will be e-mailed back.

DATABASES
     The following databeses can be used.

       All GenBank entries plus the new entries added since the latest release.
       The added sequences only.
       All translated reading frames from the latest GenBank release plus the
        added sequences.
       The added translated reading frames only.
       All EMBL entries plus the new entries added since the latest release.
       The new EMBL entries only.
       The Swiss-prot protein database.
       The NBRF/PIR protein database.
  
     The NBRF/PIR database is on the EMBL site the others are on the GenBank
     site.

BUGS
     If the NBRF/PIR database is to be searched, it will not appear in the
     list of waiting jobs, because the EMBL sites does not support a waiting
     queue list.
     Most of the results will be available within a few minutes except for
     comparisons with the NBRF/PIR database, which can take over an hour.

FILES
     cid                A c program which checks if a file is a DNA sequence
                        or a protein sequence. If a file contains more than
                        85% A, C, G and T's it is considered to be a DNA file.
                        This file must be present.
     mailfasta.doc      This documentation
     /tmp/mf$$          Temporary storage of the email message.
     

Last change: 3 July 1990
SHAR_EOF

cat << \SHAR_EOF > 'cid.c'

#include <stdio.h>

main ()
{
 int ca,cc,cg,ct,total,i;
 float result;
 char ch;
 ca=cc=cg=ct=total=i=0;
 while((ch = getchar()) == ';') readln();
 if (ch =='>') readln();
 while((ch = getchar()) != EOF)
 { switch (ch)
   { case 'a':
     case 'A': ca++; break;
     case 'c':
     case 'C': cc++; break;
     case 't':
     case 'T': ct++; break;
     case 'g':
     case 'G': cg++; break;
   }
   if (((ch>=65)&&(ch<=90))||((ch>=97)&&(ch<=122))) total++;
}
 result = (float)(ca+cc+ct+cg)/total;
 if (result>=.85) exit(0);
 else exit(1);
}

readln()
{
 while(getchar() != 10);
}
SHAR_EOF

cc -o cid cid.c
rm cid.c
chmod 755 mailfasta getentry cid
mkdir mf
mv mailfasta getentry cid mailfasta.doc mf