#!/usr/local/bin/gawk -f #@ purpose: - reformat ATOM records for various conventions #@ - set B-factors for residues and/or atoms # # change bfactors: # feed it lines like these before the pdb-file: # # bfac res 12 ala 20.00 # bfac res 12 ala model 1 20.00 # for single model in cluster # bfac atom 12 ala ha 20.00 # pdb2pdb.awk bfacfile pdb-file # gawk -f `which pdb2pdb.awk` bfacfile pdb-file # or # echo bfac res 11 ser 37.50 | gawk -f pdb2pdb.awk - pdb-file # # options: # -v pedantic=1 : do not change "2HB " to "HB2 " # -v preloc=1 : change " HXn" to "nHX" for X=[A-Z][0-9]? n=[1-3] # -v preloc=2 : change "HXXn " to "nHXX" for n=[0-9] # -v keepq=1 : keep pseudo-atoms [Q*] default keepq=0 # -v defaultcharge=1 : change to ARG+ HIS LYS+ ASP- GLU- ( for molmol ) # -v use_occ=1 : print occupancy records [ default=1 if not already present ] # -v use_bfac=1 : print Bfactor records [ default=0 if not already present ] # -v gmx=1 : prepare for gromacs [gmx] ile cd1->cd ile hd11-> hd1 etc # # -v dyana=1 : prepare for dyana [non-iupac] # # -v cyana=1 : prepare for cyana [iupac] # # -v dres=0 : add dres=residuenumber # -v skipsol=0 : skip residue name=SOL [remove water] # # -v fixnum=1 : fix numbering to ascending, every resnr has only one resname # (unlike xplor output) # -v fixchain=1 : add chains xplor files with different segids and repair HG11->1HG1 # -v upper=1 : make uppercase # BEGIN { progname="pdb2pdb.awk" if (ARGV[1]~/^[-]?(h|help)$/) { print ARGV[0] system("grep '^#' `which "progname"`|more") exit } } FNR==1 { stderr="/dev/stderr" if ( ARGV[0]=="mawk" ) { print "use gawk instead of mawk!!";exit } FIELDWIDTHS="6 5 1 4 1 3 1 1 4 1 3 8 8 8 6 6 1 3 2 4 2 2" # SAVEWIDTHS="6 5 1 4 1 3 1 1 4 1 3 8 8 8 6 6 1 3 2 4 2 2" OFS="" nfa=split(FIELDWIDTHS,ff,/ /) for (i=1;i<=nfa;i++) sfa+=ff[i] if (debug) print "sfa="sfa";nfa="nfa if (!keepq) noq=1 if (preloc) pedantic=1 if (fixchain==1) preloc=1 #==============================================================================# #=================== the pdb ATOM format description ==========================# #==============================================================================# #TOM1122222344445666789999011122222222333333334444444455555566666678889900001122 #TOM1122222 44445666 899990 222222223333333344444444555555666666 888 00001122 # COLUMNS DATA TYPE FIELD DEFINITION #1 1 - 6 Record name "ATOM" #2 7 - 11 Integer serial Atom serial number #3 12 Space(1) #4 13 - 16 Atom name Atom name #5 17 Character altLoc Alternate location indicator #6 18 - 20 Residue name resName Residue name #7 19 Space(1) #8 22 Character chainID Chain identifier #9 23 - 26 Integer resSeq Residue sequence number #0 27 Character iCode Code for insertion of residues #1 28 - 30 Space(3) #2 31 - 38 Real(8.3) x Orthogonal coordinates for X #3 39 - 46 Real(8.3) y Orthogonal coordinates for Y #4 47 - 54 Real(8.3) z Orthogonal coordinates for Z #5 55 - 60 Real(6.2) occupancy Occupancy #6 61 - 66 Real(6.2) tempFactor Temperature factor #7 67 Space(1) #8 68 - 70 Integer ftNote Footnote number, being deprecated #9 71 - 72 Space(3) #0 73 - 76 LString(4) segID Segment identifier, left-justified #1 77 - 78 LString(2) element Element symbol, right-justified #2 79 - 80 LString(2) charge Charge on the atom, IUPAC form # treat floats #12-#16 # faform[12]=faform[13]=faform[14]="%8.3f" faform[15]=faform[16]="%6.2f" default_occ=1 default_bfac=0 } FILENAME == "renum.in" { renum=1 split($0,nnn," ") if ( 2==split($0,nnn," ") ) newnum[nnn[1]]=nnn[2] print "renumbering newnum["nnn[1]"]="nnn[2]>stderr next } #debug { print ";"$0 } #read : # bfac res 12 ala 20.00 # bfac res 12 ala model 1 20.00 # bfac atom 12 ala ha 20.00 /^bfac +res/&&!/model/ { # print > stderr bfacr=1 # flag split($0,bfr,/[\t ]+/) if (upper) bfr[4]=toupper(bfr[4]) # bfacres[bfr[3]" "bfr[4]]=bfr[5] bfacres[bfr[3]]=bfr[5] print "bfac[R]: " bfr[3] " -> " bfr[5] > stderr next } # bfac res 12 ala model 1 20.00 /^bfac +res/&&/model/ { # print > stderr bfacrm=1 # flag split($0,bfrm,/[\t ]+/) if (upper) bfrm[4]=toupper(bfrm[4]) bfacresm[bfrm[3]" "bfrm[6]]=bfrm[7] print "bfac[RM]: "bfrm[3]" "bfrm[6]" -> " bfrm[7] > stderr next } /bfac +atom/ { # print > stderr bfaca=1 # flag split($0,bfa,/[\t ]+/) if (upper) { bfa[4]=toupper(bfa[4]) bfa[5]=toupper(bfa[5]) } bfacatom[bfa[3]" "bfa[5]]=bfa[6] # bfacatom[bfa[3]" "bfa[4]" "bfa[4]]=bfr[6] print "bfac:[A] ",bfa[3]" "bfa[5]" -> " bfa[6] > stderr next } /^MODEL/ { model=substr($0,6)+0 MODEL[FILENAME]=model } /^(ATOM|HETATM)/ { if (FILENAME in MODEL) { #pass } else { #MODEL 1 MODEL[FILENAME]=++model printf "MODEL %6i\n",MODEL[FILENAME] } } /^(ATOM|HETATM)/ { name=$4; altLoc=$5 # check if atomname runs over in altLoc [ field 17 ] if (substr($0,17,1)!=" ") { if (!warn_altLoc++) { print "Warning: part of atomname found in altLoc position [pos 17]", "at line="FNR";suppressing similar warnings" > stderr } name=name altLoc ; altLoc="" #print "new name: "name"["altLoc"]" > stderr } # check if protons have 2HB format [ only if pedantic is not set] if (name~/^ *[123]H/ && !pedantic && !preloc) { hnr=name+0 sub(/^ *[123]/,"",name);sub(/ *$/,"",name) name=name""hnr if (!warn_numprot++) { print "Warning: protons have 2HB type format.. changing it. ", "at line="FNR";suppressing similar warnings" > stderr } #print "new name: "name"["altLoc"]" > stderr } if (name~/^ *H[ABGDEZH][12]?[123] *$/ && preloc) { if (!(name~/^ *(HD[123]|[123]HD) *$/ && $6 ~ /ILE/ )) { gsub(/ /,"",name) hnr=substr(name,length(name)) name=substr(name,1,length(name)-1) name=hnr""name if (!warn_numprot++) { print "Warning: converting to 2HB type format.. changing it. ", "at line="FNR";suppressing similar warnings" > stderr } #print "new name: "name"["altLoc"]" > stderr } } else if (name~/^ *H[A-Z0-9][A-Z0-9][0-9] *$/ && preloc==2) { if (!(name~/^ *(HD[123]|[123]HD) *$/ && $6 ~ /ILE/ )) { gsub(/ /,"",name) hnr=substr(name,length(name)) name=substr(name,1,length(name)-1) name=hnr""name if (!warn_numprot2++) { print "Warning: converting to 2HXX type format.. changing it. ", "at line="FNR";suppressing similar warnings" > stderr } #print "new name: "name"["altLoc"]" > stderr } } skip = ((noq && name ~/^ *Q/) || ($6=="SOL" && skipsol) ) skip = ( !keeplinker && $6~/^(LL|LP|PL)[0-9]? *$/ ) # if (out!="cyana") gsub(/ /,"",name) if (gmx) { if ($6=="ILE" && name~/^([123]HD1|HD1[123]|CD1)$/) { sub(/D1/,"D",name) if (!gmx_ilecd_warn++) print "Warning: converting ILE [CH]D1 -> [CH]D" > stderr } if (name~/^OT1$/) { name ="O" if (!gmx_ot1_warn++) print "Warning: converting terminus OT1 -> O" > stderr } if (name~/^OT2$/) { name ="OXT" if (!gmx_ot2_warn++) print "Warning: converting terminus OT2 -> OXT" > stderr } } # treat floats #12-#16 # if (debug) print ";;"$12,$13,$14":"$0 for (f=12;f<=14;f++) FLF[f]=substr(sprintf(faform[f],$f),1,ff[f]) # if (debug) print ";;;"$12,$13,$14":"$0 # enter bfactors if any are read. if (bfacr || bfaca || bfacrm ) { # printf " " > stderr Bresnr=$9+0 #; printf Bresnr";" > stderr Bresnam=$6 #; printf Bresnam";" > stderr Batom=name #; printf Batom > stderr idr=Bresnr ; idrm=Bresnr" "model ; ida=Bresnr" "Batom #print idrm" "( idrm in bfacresm ) if ( idr in bfacres ) $16=bfacres[idr] if ( idrm in bfacresm ) $16=bfacresm[idrm] if ( ida in bfacatom ) $16=bfacatom[ida] } # if you want to print occupancies and bfactor if (use_occ && $15~/^ *$/) $15=default_occ if (use_bfac && $16~/^ *$/) $16=default_bfac for (f=15;f<=16;f++) { if ($f!~/^ *$/) FLF[f]=substr(sprintf(faform[f],$f),1,ff[f]) else FLF[f]=$f } if (out=="dyana" || dyana==1) { defaultcharge=1 defaulthispos=1 if ($6=="ILE" && name~/^([123]HD|HD[123]|CD)$/) sub(/D/,"D1",name) if (name=="H") name="HN" if (name~"^(HT?1|1HT?)$") name="HN" if (name~"^(OT?1|1OT?)$") name="O" else if (name~"^(HT?[123]|[123]HT?)$") skip=1 else if (name ~ "^(OXT|OT?[12])$" ) skip=1 else if (name~"^M") skip=1 if (length(name)<=3 && name~/^[A-Z]/) { name=" "name }quit if (name~/^ *H[ABGDEZH][12]?[123] *$/) if ( !($6~/(HIS|TRP|PHE|TYR)/ && name~/H[DEHZ]/) ) { gsub(/ /,"",name) hnr=substr(name,length(name)) name=substr(name,1,length(name)-1) name=hnr""name if (!warn_numprot++) { print "Warning: converting to 2HB type format.. changing it. ", "at line="FNR";suppressing similar warnings" > stderr } } } if (out=="cyana" || cyana==1) { defaultcharge=0 defaulthispos=0 if ($6=="ILE" && name~/^([123]HD|HD[123]|CD)$/) sub(/D/,"D1",name) if (name=="HN") name="H" if (name~"^(HT?1|1HT?)$") name="H" if (name~"^(OT?1|1OT?)$") name="O" else if (name~"^(HT?[123]|[123]HT?)$") skip=1 else if (name ~ "^(OXT|OT?[12])$" ) skip=1 else if (name~"^M") skip=1 if (length(name)<=3 && name~/^[A-Z]/) { name=" "name }quit if (name~/^ *H[ABGDEZH][12]?[123] *$/) if ( !($6~/(HIS|TRP|PHE|TYR)/ && name~/H[DEHZ]/) ) { gsub(/ /,"",name) hnr=substr(name,length(name)) name=substr(name,1,length(name)-1) name=hnr""name if (!warn_numprot++) { print "Warning: converting to 2HB type format.. changing it. ", "at line="FNR";suppressing similar warnings" > stderr } } } if ( pedantic && length(name)<=3 && name~/^[A-Z]/) { name=" "name } if (defaultcharge==1) { defaultpos=1;defaultneg=1 } if (defaultpos) { if (defaultpos==1) defaultpos="ARG|LYS" if ($6~"^" defaultpos "$") $7="+" # $6=$6"+" } if (defaultneg) { if (defaultneg==1) defaultneg="ASP|GLU" if ($6~"^" defaultneg "$") $7="-" # $6=$6"-" } if (defaulthispos && $6~/HIS/) { if (defaulthispos==1) defaulthispos="HIS" if ($6~"^" defaulthispos "$") $7="+" # $6=$6"+" } if (gmx) { $7=" " } if (cns==1||1) { $7=" " gsub(/ /,"",name) name=" "name if (length(name)>4) sub(/^ /,"",name) name=substr(name,1,4) } #if (length(name)==3 && name~/^ [0-9]/) { if (name~/^ [1-3]/) { sub("^ ","",name) #print name > stderr } atomform=\ "%-6s%5i %-4s%1s%-3s%1s%1s%4i%1s %8s%8s%8s%6s%6s %3s %4s%2s%2s" tmp="1 2 3 4 5 6 7 8" if (debug) for (f=12;f<=16;f++) print "["f"]"$f resnum=$9+dres if (renum==1) resnum = newnum[resnum] resnam=$6 segid=$20 chainid=$8 # fix xplor pdbs with can have residues with same res-nr but different segid # this confuses molmol chains="ABCDEFGHIJKLMNOPQRSTUVWXYZ" if (fixchain) { if (prevsegid!=segid) { chainid=substr(chains,++chainnr,1) } else { chainid=substr(chains,chainnr,1) } #printf "%s %s %s\n",resnam,resnum,segid>stderr prevsegid=segid } else if (fixnum) { if (resnum+0==prevnum+0 && prevnam==resnam && prevsegid==segid) { # ok } else { while (resnum+0 stderr resnum+=100 if (resnum+0==prevnum+0 && prevnam!=resnam && prevsegid!=segid) { resnum+=100 } } } #printf "%s %s %s\n",resnam,resnum,segid>stderr prevnum=resnum prevnam=resnam prevsegid=segid } if (renum==1) resnum = newnum[resnum] news0=sprintf(atomform,$1,$2,name,altLoc,resnam,$7,chainid,resnum,$10, FLF[12],FLF[13],FLF[14],FLF[15],FLF[16],$18,segid,$21,$22) # FLF[12],FLF[13],FLF[14],FLF[15],FLF[16],$18,$20,$21,$22) # skip trailing spaces ,makes smaller pdb's. # sub(/ *$/,"",news0) if (upper) news0=toupper(news0) if (!skip) { print news0;next } news0 = $0 } 1