#!/bin/bash # THESEUS: Maximum likelihood superpositioning of multiple # macromolecular structures # # -/_|:|_|_\- # Douglas L. Theobald # Department of Biochemistry # Brandeis University # Waltham, MA 02454 # # dtheobald@brandeis.edu # dtheobald@gmail.com # # theseus_align # # Usage: theseus_align [theseus_options] -f pdbfile1.pdb pdbfile2.pdb ... # # 'theseus_align' allows for a quick-and-dirty way to ML superposition proteins # with different sequences. It should work very well when the protein sequences # are relatively similar, although the ML method will still give much better # results than least-squares when the sequences are moderately divergent. # Technically, this procedure gives a structure-based superposition of a # sequence-based alignment. It _does not_ perform a structure-based alignment. # # First, the script uses THESEUS to create FASTA formatted sequence files # corresponding to the exact protein sequences found in the pdb files that you # supply. # # Second, these sequences are aligned using the multiple sequence alignment # program of your choice -- currently set up for MUSCLE and easily modified for # CLUSTALW, T_COFFEE, KALIGN, DIALIGN2, or MAFFT. Any multiple sequence # alignment program can be used, as long as it can generate clustal-formatted # files. However, I highly recommend Bob Edgar's MUSCLE program for both its # speed and accuracy. It is easy to install using either precompiled binaries or # by compiling from scratch: # http://www.drive5.com/muscle/ # # Third, THESEUS performs a superposition of the structures using the sequence # alignment as a guide. # # The following six constant strings should be modified to whatever is # convenient and applicable. theseus="/usr/local/bin/theseus"; # where to find the THESEUS binary executable fastafile="theseus.fasta"; filemapfile="theseus.filemap"; alignmentfile="theseus.aln"; # for MUSCLE -- http://www.drive5.com/muscle/ alignprog="/usr/local/bin/muscle"; align_cmd="${alignprog} -maxiters 32 -in ${fastafile} -out ${alignmentfile} -clwstrict"; # for CLUSTALW -- ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalW/ #alignprog="/usr/local/bin/clustlaw"; #align_cmd="${alignprog} -infile=${fastafile} -outfile=${alignmentfile}"; # for MAFFT -- http://www.biophys.kyoto-u.ac.jp/%7Ekatoh/programs/align/mafft/ #alignprog="/usr/local/bin/mafft"; #align_cmd="${alignprog} --maxiterate 1000 --localpair --clustalout ${fastafile} > ${alignmentfile}"; # for T_COFFEE -- http://igs-server.cnrs-mrs.fr/%7Ecnotred/Projects_home_page/t_coffee_home_page.html #alignprog="/usr/local/bin/t_coffee"; #align_cmd="${alignprog} ${fastafile} -outfile=${alignmentfile}"; # for KALIGN -- http://msa.cgb.ki.se/ #alignprog="/usr/local/bin/kalign" #align_cmd="${alignprog} -i ${fastafile} -f aln | sed 's/Kalign/CLUSTALW/' > ${alignmentfile}"; # for DIALIGN2 -- http://bibiserv.techfak.uni-bielefeld.de/dialign/ #alignprog="/usr/local/bin/dialign2" #align_cmd="${alignprog} -cw ${fastafile}; sed 's/\/\///' ${fastafile%.*}.cw | sed 's/DIALIGN/CLUSTALW/'"; ################################################################################ ################################################################################ # NOTHING BELOW HERE SHOULD BE CHANGED ################################################################################ if [ ! -f ${alignprog} ] || [ ! -x ${alignprog} ] || [ ! -s ${alignprog} ] then printf "\nERROR: Problem with multiple sequence aligment executable, ${alignprog}\n"; ${alignprog}; exit 1; elif [ ! -f ${theseus} ] || [ ! -x ${theseus} ] || [ ! -s ${theseus} ] then printf "\nERROR: Problem with THESEUS executable, ${alignprog}\n"; ${theseus}; exit 1; fi usage="Usage: ${0} [theseus_options] -f pdbfile1.pdb pdbfile2.pdb ..."; declare -a opts=( $@ ); # save the command line arguments (( argc = $# )); # save the number of command line arguments # shift up until we get past '-f', which signifies that the rest are files (( optn = 0 )); while [ "${1}" != "-f" ] && [ ! -z "${1}" ] do shift; (( optn++ )); done # Ensure that there is something on the command line if [[ -z "$@" ]] # double brackets don't do word splitting then printf "\n${usage}\n\n"; exit 1; else shift; fi for (( i = optn; i < argc; ++i )) do unset opts[${i}]; # nix the pdb files from the options array done pdbs="$@"; echo ${pdbs} # Make sure there are pdb files on the command line if [[ -z "${pdbs}" ]] # double brackets don't do word splitting then printf "\n${usage}\n\n"; exit 1; else shift; fi # Check each pdb file to see if it exists, if it is readable, and if it is non-empty for pdb in ${pdbs} do if [ ! -f ${pdb} ] || [ ! -r ${pdb} ] || [ ! -s ${pdb} ] then printf "\nProblem with file: ${pdb}\n" printf "\n${usage}\n\n"; exit 1; fi done theseus_cmd="${theseus} ${opts[@]} -f -M ${filemapfile} -A ${alignmentfile} ${pdbs}"; # Use THESEUS to make fasta sequence files corresponding to each pdb ${theseus} -f -F ${pdbs}; if [ ! $? ] then printf "\nERROR: THESEUS did not successfully create all FASTA sequence files.\n"; printf "\n${usage}\n\n"; exit 1; fi for pdb in ${pdbs} do fasta="${pdb}.fst"; if [ ! -f ${fasta} ] || [ ! -r ${fasta} ] || [ ! -s ${fasta} ] then printf "\nProblem with FASTA sequence file ${fastafile} for ${pdb}\n" printf "\n${usage}\n\n"; exit 1; fi done if [ -f ${fastafile} ] then rm ${fastafile}; fi # Concatenate all fasta files into one large multiple sequence fasta file for pdb in ${pdbs} do cat ${pdb}.fst >> ${fastafile}; done #ls -1 ${pdbs} | awk '{print $1" "$1}' > ${filemapfile}; if [ -f ${filemapfile} ] then rm ${filemapfile}; fi # Make the mapfile for THESEUS to use (which sequence corresponds to which file) for pdb in ${pdbs} do echo "${pdb} ${pdb}" >> ${filemapfile}; done # Align the sequences printf "\n\n${align_cmd}\n"; ${align_cmd}; if [ ! $? ] then printf "\nERROR: Sequence alignment failed.\n"; printf "\n${usage}\n\n"; exit 1; fi # Superimpose with THESEUS based on the sequence alignment generated above printf "\n\n${theseus_cmd}\n"; ${theseus_cmd}; exit 0;