deboer@balaena.bio.vu.nl (Thon de Boer) (07/17/90)
I wrote a program which uses the services offered by GenBank and EMBL for doing FASTA searches. It will send a sequence to a mail-server which will do a sequence similarity search against the specified databases using the FASTA program written by William Pearson and David Lipman. The program runs interactive and is easy to use. It can read files in the Pearson format, DNA strider files or plain DNA and PROTEIN files with no comment lines. For more information about the FASTA mail servers you can contact GenBank or EMBL. (send "HELP" in the subject field of a mail message to SEARCH@GENBANK.BIO.NET or FASTA@EMBL (bitnet/EARN etc)) or FASTA@EMBL (bitnet/EARN etc)) Hope you can use it ------------------------------------------------------------------------------- _______ ___ Thon de Boer / /__/ Dept. of Microbiological physiology // __ __ _ __ / \ __ __ __ Biological Faculty //-- / // / / \ /_/ / // //_// ' Vrije Universiteit, Amsterdam, Holland // //_// / /__//_ /___//_//_ / deboer@bio.vu.nl ------------------------------------------------------------------------------- #! /bin/sh # This is a shell archive, meaning: # 1. Remove everything above the #! /bin/sh line. # 2. Save the resulting text in a file. # 3. Execute the file with /bin/sh (not csh) to create: # mailfasta # cid # getentry # mailfasta.doc # in directory mf cat << \SHAR_EOF > 'mailfasta' #/bin/csh echo ' ' echo ' ' echo ' MailFasta' echo ' *********' echo By Thon de Boer echo Department of Microbiological Fysiology echo Vrije universiteit AMSTERDAM Holland. echo This program is in the Public Domain echo 'Send comments to deboer@bio.vu.nl (email).' echo ' ' echo Use \'mailfasta -q\' to get the queue from the fasta mail server echo " or 'mailfasta [sequencefile1 sequencefile2 ..]'" echo ' ' echo This program will read a sequence file and mail it to echo 'GenBank (or EMBL) were it will be scanned against the sequence databases' echo 'using the FASTA program' echo ' ' echo ' ' set stop = false while ($stop == false) if ($1 == '') then set argument = empty else set argument = $1 endif switch ($argument) case -q: echo QUEUE | mail SEARCH@GENBANK.BIO.NET echo Queue command send exit breaksw case -*: echo mailfasta: Unknow flag \'$argument\' echo Use \'mailfasta -q\' to get the queue from the fasta mail server echo " or 'mailfasta [sequencefile1 sequencefile2 ..]'" exit breaksw endsw if ($argument == empty) then set good=false while ($good == false) echo Enter the filename wich contains the sequence set file=$< if ($file != '') then if !(-f $file) then echo mailfasta: File \'$file\' does not exist else echo ' ' more $file echo ' ' echo 'Is this the right file ? ([yes]/no)' set choice = $< switch ($choice) case n*: breaksw default: set good=true breaksw endsw endif endif end else if !(-f $argument) then echo mailfasta: File \'$argument\' does not exist exit else set file = $argument endif if {(cid < $file)} then set type = DNA else set type = PROTEIN endif echo ' ' echo Now using $type file: $file echo '*********' grep '>' $file grep ';' $file echo ' ' endif set db = (genbank/all genbank/new genpept/all genpept/new embl/all embl/new swiss-prot/all nbrf genbank/primate genbank/rodent genbank/other_mammalian genbank/other_vertebrate genbank/invertebrate genbank/plant genbank/organelle genbank/bacterial genbank/structural_rna genbank/viral genbank/phage genbank/synthetic genbank/unannotated) echo 'Which database(s) do you want to search ?' echo ' ' if ($type == DNA) echo ' 1 All GenBank sequences (including new seq since latest release)' if ($type == DNA) echo ' 2 The new GenBank entries' if ($type == PROTEIN) echo ' 3 All translated protein reading frames from GenBank' if ($type == PROTEIN) echo ' 4 The new entries of translated protein reading frames' if ($type == DNA) echo ' 5 All EMBL sequences (including the new sequences)' if ($type == DNA) echo ' 6 The new EMBL entries' if ($type == PROTEIN) echo ' 7 All SWISS-PROT protein entries' if ($type == PROTEIN) echo ' 8 All NBRF/PIR protien entries (results return very slow) ' echo ' ' if ($type == DNA) then echo ' GenBank subdivisions (for faster searching)' echo ' ' echo ' 9 The primate sequences 16 The bacterial sequences' echo ' 10 The rodent sequences 17 The structural RNA sequences' echo ' 11 The other mammalian sequences 18 The viral sequences' echo ' 12 The other vertebrate seq 19 The phage sequences' echo ' 13 The invertebrate sequences 20 The synthetic sequences' echo ' 14 The plant sequences 21 The unannotated sequences' echo ' 15 The organelle sequences' echo ' ' endif set good=false while ($good == false) echo 'Enter the number(s) of your choice (seperated by a <SPACE>)' set choice=$< if ("$choice" != "") set good = true foreach i ($choice) if ($type == DNA) then switch ($i) case 1: case 2: case 5: case 6 case 9: case 10: case 11: case 12: case 13: case 14: case 15: case 16: case 17: case 18: case 19: case 20: case 21: breaksw default: echo mailfasta: Invallid choice \'$i\' set good = false breaksw endsw else switch ($i) case 3: case 4: case 7: case 8: breaksw default: echo mailfasta: Invallid choice \'$i\' set good = false breaksw endsw endif end end set good=false while ($good == false) echo ' ' echo 'Enter the sensitivity (lower number means more sensitive)' if ($type == DNA) then echo ' (3..6) [4]' else echo ' (1..2) [1]' endif set ktup=$< if ($ktup != '') then if ($type == DNA) then switch ($ktup) case [3-6]: set good=true breaksw default: echo mailfasta: Invalid choice \'$ktup\' breaksw endsw else switch ($ktup) case [1-2]: set good=true breaksw default: echo mailfasta: Invalid choice \'$ktup\' breaksw endsw endif else set good=true endif end echo ' ' echo 'Enter the maximum number of matched sequences [100]' set scores=$< echo ' ' echo 'Enter the maximum number of alignments [20]' set align=$< grep ">" $file > /tmp/mf2$$ set grp = `cat /tmp/mf2$$` foreach i ($choice) if ($i != 8) then echo DATALIB $db[$i] >> /tmp/mf$$ if ($ktup != '') echo KTUP $ktup >> /tmp/mf$$ if ($scores != '') echo SCORES $scores >> /tmp/mf$$ if ($align != '') echo ALIGNMENTS $align >> /tmp/mf$$ echo BEGIN >> /tmp/mf$$ if (-z /tmp/mf2$$) echo ">"$file $type file >> /tmp/mf$$ grep -v \; $file >> /tmp/mf$$ mail SEARCH@GENBANK.BIO.NET < /tmp/mf$$ else echo LIB nbrf >> /tmp/mf$$ if ($ktup != '') echo WORD $ktup >> /tmp/mf$$ if ($scores != '') echo LIST $scores >> /tmp/mf$$ if ($align != '') echo ALIGN $align >> /tmp/mf$$ if !(-z /tmp/mf2$$) then echo TITLE $grp >> /tmp/mf$$ echo SEQ >> /tmp/mf$$ grep -v ">" $file >> /tmp/mf$$ else echo TITLE $file $type file >> /tmp/mf$$ echo SEQ >> /tmp/mf$$ grep -v \; $file >> /tmp/mf$$ endif mail FASTA@EMBL.BITNET < /tmp/mf$$ endif rm /tmp/mf$$ end if ($2 == '') then set stop = true else shift endif end SHAR_EOF cat << \SHAR_EOF > 'getentry' #/bin/csh if ($1 != '') then foreach i ($*) echo $i | mail retrieve@genbank.bio.net end else echo getentry: Retrieve a sequence database entry via mail from GenBank echo 'Usage: getentry entry [entry entry ....]' echo ' entry can be an accession number or a locus name' endif SHAR_EOF cat << \SHAR_EOF > 'mailfasta.doc' MAILFASTA and GETENTRY NAME mailfasta - send a sequence file for comparison with sequence databases getentry - get an entry from the sequence databases SYNOPSIS mailfasta -q mailfasta [sequencefile1 sequencefile2 ...] getentry entry1 [entry2 ...] DESCRIPTION MAILFASTA The GenBank site (genbank.bio.net) and the EMBL site (embl.bitnet) contain several sequence databases. A sequence can be compared with these databases by sending a specially formatted email message to the sites mail-server. On the sites the sequence is compared to the databases with the FASTA program written by Bill Pearson. The FASTA program also finds related sequences. Mailfasta is a interactive program which reads a sequence file (either DNA or (one letter coded) protein) and asks which databases is to be searched. Then it asks the sensitivity of the search and the maximum number of best-ranked results. It will send the sequence to the apropriate site and the results of the search will be e-mailed back. The sequence file must be an ascii file and the first line can be a comment line by starting it with '>'. The program will also read and correctly handle sequencefiles made by the Macintosh program 'DNA strider 1.0'. a program written by Christian Marck. The sequence does not have to be continuous, spaces are allowed, but no other characters, like numbers. The list of waiting jobs at the GenBank site can be obtained by using the option -q. GETENTRY Entries of interest can be obtained with the program getentry. entry can be either the locus name or the accession number. The entries will be e-mailed back. DATABASES The following databeses can be used. All GenBank entries plus the new entries added since the latest release. The added sequences only. All translated reading frames from the latest GenBank release plus the added sequences. The added translated reading frames only. All EMBL entries plus the new entries added since the latest release. The new EMBL entries only. The Swiss-prot protein database. The NBRF/PIR protein database. The NBRF/PIR database is on the EMBL site the others are on the GenBank site. BUGS If the NBRF/PIR database is to be searched, it will not appear in the list of waiting jobs, because the EMBL sites does not support a waiting queue list. Most of the results will be available within a few minutes except for comparisons with the NBRF/PIR database, which can take over an hour. FILES cid A c program which checks if a file is a DNA sequence or a protein sequence. If a file contains more than 85% A, C, G and T's it is considered to be a DNA file. This file must be present. mailfasta.doc This documentation /tmp/mf$$ Temporary storage of the email message. Last change: 3 July 1990 SHAR_EOF cat << \SHAR_EOF > 'cid.c' #include <stdio.h> main () { int ca,cc,cg,ct,total,i; float result; char ch; ca=cc=cg=ct=total=i=0; while((ch = getchar()) == ';') readln(); if (ch =='>') readln(); while((ch = getchar()) != EOF) { switch (ch) { case 'a': case 'A': ca++; break; case 'c': case 'C': cc++; break; case 't': case 'T': ct++; break; case 'g': case 'G': cg++; break; } if (((ch>=65)&&(ch<=90))||((ch>=97)&&(ch<=122))) total++; } result = (float)(ca+cc+ct+cg)/total; if (result>=.85) exit(0); else exit(1); } readln() { while(getchar() != 10); } SHAR_EOF cc -o cid cid.c rm cid.c chmod 755 mailfasta getentry cid mkdir mf mv mailfasta getentry cid mailfasta.doc mf