#!/usr/bin/python ''' This program expands molecular coordinates in pdb files using the BIOMT transformation matrix. Annotations other than coordinates are currently discarded. Support for secondary structure annotation may or may not be added at a later time. Rasmol and Pymol do a pretty good job at inferring secondary structure anyway, however. Send comments or bug reports to mpalmer at uwaterloo dot ca. --- Copyright Michael Palmer, University of Waterloo. Free for use in any way you see fit but you have to leave this notice in place and in effect in derived works. ''' import sys, urllib, gzip, cStringIO, re, os.path group_marker = 'APPLY THE FOLLOWING TO CHAINS:' url = 'watcut.uwaterloo.ca/makemultimer' usage = ''' MakeMultimer.py expands multimeric biological macromolecules from BIOMT records contained in pdb files. Usage: python MakeMultimer.py if is a local file, this file will be used. Alternatively, if it is a valid pdb identifier, the MakeMultimer.py will download it from the Protein Data Bank. ''' class PdbError(Exception): pass def load_remote_pdb(pdbcode): ''' obtain a pdb file by name from the protein data bank. what kinds of plausibility checks to perform? - the code should be four characters long - be alphanumeric ''' urltemplate = 'http://dx.doi.org/10.2210/pdb%s/pdb' pdbcode = pdbcode.split('.')[0].lower() if len(pdbcode) != 4 or re.findall('\W', pdbcode): raise PdbError, 'malformed pdb code' # ok, code looks o.k. - let's try url = urltemplate % pdbcode request = urllib.urlopen(url) binary = request.read() pseudo_file = cStringIO.StringIO(binary) try: extracted = gzip.GzipFile('','r',0,pseudo_file).read() except IOError: raise PdbError, 'invalid pdb code' return extracted class Atom(object): ''' ATOM 687 CA SER B 92 26.506 23.631 11.670 1.00 63.47 C HETATM 6747 K K G 1 34.346 68.181 0.000 0.50133.54 K prepare a template from this for output with transformed coordinates. ''' def __init__(self, rawline): self.rawline = rawline self.orig_number = self.number = int(rawline[6:11]) self.chain = rawline[21] self.x = float(rawline[30:38]) self.y = float(rawline[38:46]) self.z = float(rawline[46:54]) info0 = rawline[:6] info1 = rawline[11:21] info2 = rawline[22:30] info3 = rawline[54:] self.descriptor_template = '%s%%5d%s%%s%s' % (info0, info1, info2) self.coordinate_template = '%%8.3f%%8.3f%%8.3f%s' % info3 def transformed(self, *coords): ''' fill in new coordinates, leave descriptor template blank - will be filled in later. ''' return self.descriptor_template + self.coordinate_template % coords def __str__(self): return self.rawline __repr__ = __str__ class ReplicationGroup(object): ''' one biomolecule may contain several matrices that apply to separate sets of chains. So this is actually where the action is. We can do the replication here, but leave up the naming to the BioMolecule class. ''' def __init__(self, bm_lines, original_chains): self.original_chains = original_chains # reference to all chains in the pdb self.source_chains = set() self.replicated_chains = {} self.matrices = [] in_matrix = False matrix_lines = [] for line in bm_lines: if line.startswith(group_marker): line = line[len(group_marker):] if line.startswith('BIOMT'): in_matrix = True if not in_matrix: # this is still a line enumerating chains raw = line.split() self.source_chains.update([r.strip(',') for r in raw]) else: matrix_lines.append(line) # now, process matrix lines while matrix_lines: first, matrix_lines = matrix_lines[:3], matrix_lines[3:] new_matrix = [] for j in range(3): ml = first[j][11:] frags = ml.split() new_matrix.append([float(x) for x in ml.split() ]) self.matrices.append(new_matrix) self.replicate() def replicate(self): ''' apply all matrices to all chains. This includes the original chain as well. Is there even a need to retain the original chain name? Let's keep it at least for annotation. ''' for chain in self.source_chains: self.replicated_chains[chain] = self._replicateChain(chain) def _replicateChain(self, chain): ''' apply all applicable transformations to one chain and return all resulting copies ''' atoms = self.original_chains[chain] replicated = [] for i, matrix in enumerate(self.matrices): transformed_atoms = [] for atom in atoms: coords = [] for a,b,c,d in matrix: coords.append(a*atom.x + b*atom.y + c*atom.z + d) t = atom.transformed(*coords) transformed_atoms.append(t) replicated.append(transformed_atoms) return replicated def __len__(self): ''' calculate the number of chains that will be present after replication. ''' return len(self.matrices) * len(self.source_chains) class BioMolecule(object): ''' represents one biomolecule. since the main action is in the ReplicationGroup class, the parsing here is rather dumb. the main job here is to delegate to one or more ReplicationGroups and later to merge their output as needed. ''' title_template = 'Multimer expanded from BIOMT matrix in pdb file %s' header_line_template = 'new chain: %s from original chain: %s atoms: %s - %s' def __init__(self, bm_lines, all_chains): ''' instantiated for every block of biomt lines. chains is a references to the dictionary with all chains available in the entire pdb file, from which we will extract what is necessary. ''' self.all_chains = all_chains self.replication_groups = [] self.bm_lines = bm_lines self.collected_chains = [] in_group = False group = [] for line in bm_lines: if not line.strip(): continue if line.startswith(group_marker): in_group = True if group: self.replication_groups.append(ReplicationGroup(group, self.all_chains)) group = [] if in_group: group.append(line) if group: # finish up last one self.replication_groups.append(ReplicationGroup(group, self.all_chains)) def process(self): ''' replicate all groups and collect the output. chain naming: How do we do it? As before - if we have no more than 26 chains, we name each chain separately, else we retain the original names. also, for each chain, we output the atom range. ''' # generator for next running atom number - # used across all chains and hetatms atom_count = (x for x in xrange(1, sys.maxint)) num_chains = 0 for rg in self.replication_groups: num_chains += len(rg) if num_chains <= 26: new_chains = (x for x in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ') chain_name = lambda oldchain: new_chains.next() else: chain_name = lambda oldchain: oldchain for rg in self.replication_groups: for old_chain, chains in sorted(rg.replicated_chains.items()): for chain in chains: lst = [] new_chain = chain_name(old_chain) for atom in chain: atom_no = atom_count.next() filled = atom % (atom_no, new_chain) lst.append((atom_no, filled)) self.collected_chains.append((old_chain, new_chain, \ lst[0][0], lst[-1][0], \ [l[1] for l in lst])) def output(self, filename='stuff'): ''' return our collected wisdom in one big string. This will be the main part of the output pdb file. ''' self.process() header = [self.title_template % filename] header.append('by MakeMultimer.py (%s)' % url) header.append('') header.append('BIOMT instructions applied:') header.extend(self.bm_lines) header.append('') atoms_out = [] for old, new, first, last, atoms in self.collected_chains: header.append(self.header_line_template % (new, old, first, last)) atoms_out.extend(atoms) header.append('') out = ['REMARK ' + h for h in header] + atoms_out return '\n'.join(out) class PdbReplicator(object): ''' responsible for slicing up one pdb file into biomolecules. the actual replication occurs elsewhere. ''' def __init__(self, pdb_string, handle_hetatm = "keep"): # generator for next running atom number - used across all chains and hetatms self.Number = (x for x in xrange(1, sys.maxint)) self.pdb_lines = pdb_string.splitlines() self.handle_hetatm = handle_hetatm self.originalchains = self.parseMolecule() self.output = [] self.biomolecules = self.parseBiomt() def parseBiomt(self): ''' carve up the file according to biomolecules, which are defined in the REMARK 350 lines. ''' bm_lines = [l[10:].strip() for l in self.pdb_lines if l.startswith('REMARK 350')] if not bm_lines: # this file doesn't have any biomt instructions for us. raise PdbError, 'input file does not contain any BIOMT instructions' # OK, try to carve up the biomolecules biomolecules = [] in_group = False group = [] bm_marker = 'BIOMOLECULE:' for line in bm_lines: if line.startswith(bm_marker): # start new group in_group = True if group: biomolecules.append(BioMolecule(group, self.originalchains)) group = [] if in_group and line.strip(): group.append(line) if group: biomolecules.append(BioMolecule(group, self.originalchains)) return biomolecules def parseMolecule(self): ''' extract all chains from a pdb file. Problem: HETATM records may or may not have a chain identifiers. we will look at handling those unnamed hetero atoms again later, once we get a better feel for the entire thing. ''' atom_lines = [l for l in self.pdb_lines if l.startswith('ATOM')] hetatm_lines = [l for l in self.pdb_lines if l.startswith('HETATM')] for hl in hetatm_lines: orig_chain = hl[21:22] if orig_chain.strip(): # keep only heteroatoms that have a chain atom_lines.append(hl) atoms = [Atom(al) for al in atom_lines] chains = {} for a in atoms: chains.setdefault(a.chain, []).append(a) return chains if __name__ == '__main__': try: infile = sys.argv[1] except IndexError: print usage sys.exit() if os.path.isfile(infile): pdblurb = open(infile).read() else: pdblurb = load_remote_pdb(infile) r = PdbReplicator(pdblurb) outfile_template = infile.split('.')[0] + '_mm%s.pdb' for i, bm in enumerate(r.biomolecules): outfile = outfile_template % (i+1) open(outfile, 'w').write(bm.output(infile))