Chemistry
[19]:
from molgraph import chemistry
import tensorflow as tf
import matplotlib.pyplot as plt
[20]:
# Convert SMILES representation of molecule to an RDKit molecule
rdkit_mol = chemistry.molecule_from_string(
'OCC1OC(C(C1O)O)n1cnc2c1ncnc2N')
print(type(rdkit_mol))
<class 'rdkit.Chem.rdchem.Mol'>
chemistry.features (level 1)
[24]:
# Obtain RDKit atom and bond via the RDKit API
atom = rdkit_mol.GetAtoms()[0]
bond = rdkit_mol.GetBonds()[0]
symbol_feature = chemistry.features.Symbol()
bondtype_feature = chemistry.features.BondType()
print('Symbol: ', symbol_feature(atom))
print('BondType:', bondtype_feature(bond))
Symbol: O
BondType: SINGLE
Arguments of a chemistry.Feature
allowable_setspecifies what features should be consideredoov_sizespecifies the number of bins alloted to “out-of-vocabulary” features (based onallowable_set)ordinalspecifies if encoding should be ordinal or not (nominal)
Importantly, the arguments above will only take effect when wrapped in a chemistry.Encoding, which occur automatically inside a chemistry.Featurizer or chemistry.Tokenizer.
[25]:
print(chemistry.features.BondType())
print(chemistry.features.BondType(allowable_set=['SINGLE', 'DOUBLE'], oov_size=1, ordinal=False))
BondType(allowable_set={'DOUBLE', 'TRIPLE', 'AROMATIC', 'SINGLE'}, ordinal=False, oov_size=0)
BondType(allowable_set=['SINGLE', 'DOUBLE'], ordinal=False, oov_size=1)
chemistry.Featurizer (level 2)
A chemistry.Featurizer (or chemistry.Tokenizer) can be built from a list of chemistry.Features. Note: if ordinal=False (default), allowable_set will be sorted internally via sort().
[26]:
atom_encoder = chemistry.Featurizer([
chemistry.features.Symbol(['C', 'N', 'O'], oov_size=1),
chemistry.features.Hybridization(['SP', 'SP2', 'SP3'], oov_size=1)
])
print('Symbol: ', atom.GetSymbol())
print('Hybridization:', atom.GetHybridization().name)
# first and fifth bin is alloted to OOVs
print(atom_encoder(atom))
Symbol: O
Hybridization: SP3
[0. 0. 0. 1. 0. 0. 0. 1.]
Create a custom chemistry.Feature
[27]:
class AtomMass(chemistry.Feature):
def __call__(self, x):
mass = x.GetMass()
if mass < 5:
return 'x<5'
elif mass < 40:
return '5<x<40'
else:
return '40<x'
featurizer = chemistry.Featurizer([
AtomMass({'x<5', '5<x<40', '40<x'}),
chemistry.features.Symbol({'C', 'N'})
])
print('AtomMass:', atom.GetMass())
print('Symbol: ', atom.GetSymbol())
print(featurizer(atom))
AtomMass: 15.999
Symbol: O
[0. 1. 0. 0. 0.]
chemistry.MolecularGraphEncoder (level 3)
chemistry.MolecularGraphEncoder encodes inputted molecule(s) as molecular graphs, namely as a GraphTensor.
Below a single molecule is encoded as a GraphTensor
[29]:
atom_encoder = chemistry.Featurizer([
chemistry.features.Symbol(allowable_set={'C', 'N', 'O'})
])
bond_encoder = chemistry.Featurizer([
chemistry.features.BondType(allowable_set={'SINGLE', 'DOUBLE'})
])
mol_encoder = chemistry.MolecularGraphEncoder(
atom_encoder=atom_encoder, # not default, required
bond_encoder=bond_encoder, # not default, optional
positional_encoding_dim=16, # default
self_loops=False, # default
molecule_from_string_fn=chemistry.molecule_from_string # default
)
print(mol_encoder(rdkit_mol))
GraphTensor(
sizes=<tf.Tensor: shape=(), dtype=int64>,
node_feature=<tf.Tensor: shape=(19, 3), dtype=float32>,
edge_src=<tf.Tensor: shape=(42,), dtype=int32>,
edge_dst=<tf.Tensor: shape=(42,), dtype=int32>,
edge_feature=<tf.Tensor: shape=(42, 2), dtype=float32>,
node_position=<tf.Tensor: shape=(19, 16), dtype=float32>)
Here, a list of molecules are encoded as a GraphTensor
[32]:
smiles = [
'OCC1OC(C(C1O)O)n1cnc2c1ncnc2N',
'C(C(=O)O)N',
'C1=CC(=CC=C1CC(C(=O)O)N)O'
]
# Uses multiprocessing by default
graph_tensor = mol_encoder(smiles, processes=8)
print(graph_tensor)
GraphTensor(
sizes=<tf.Tensor: shape=(3,), dtype=int64>,
node_feature=<tf.Tensor: shape=(37, 3), dtype=float32>,
edge_src=<tf.Tensor: shape=(76,), dtype=int32>,
edge_dst=<tf.Tensor: shape=(76,), dtype=int32>,
edge_feature=<tf.Tensor: shape=(76, 2), dtype=float32>,
node_position=<tf.Tensor: shape=(37, 16), dtype=float32>)
Inspect generated GraphTensor
[33]:
print('node_feature:', graph_tensor.node_feature, end='\n\n')
print('edge_feature:', graph_tensor.edge_feature, end='\n\n')
print('edge_dst:', graph_tensor.edge_dst, end='\n\n')
print('edge_src:', graph_tensor.edge_src, end='\n\n')
node_feature: tf.Tensor(
[[0. 0. 1.]
[1. 0. 0.]
[1. 0. 0.]
[0. 0. 1.]
[1. 0. 0.]
[1. 0. 0.]
[1. 0. 0.]
[0. 0. 1.]
[0. 0. 1.]
[0. 1. 0.]
[1. 0. 0.]
[0. 1. 0.]
[1. 0. 0.]
[1. 0. 0.]
[0. 1. 0.]
[1. 0. 0.]
[0. 1. 0.]
[1. 0. 0.]
[0. 1. 0.]
[1. 0. 0.]
[1. 0. 0.]
[0. 0. 1.]
[0. 0. 1.]
[0. 1. 0.]
[1. 0. 0.]
[1. 0. 0.]
[1. 0. 0.]
[1. 0. 0.]
[1. 0. 0.]
[1. 0. 0.]
[1. 0. 0.]
[1. 0. 0.]
[1. 0. 0.]
[0. 0. 1.]
[0. 0. 1.]
[0. 1. 0.]
[0. 0. 1.]], shape=(37, 3), dtype=float32)
edge_feature: tf.Tensor(
[[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[1. 0.]
[0. 1.]
[1. 0.]
[0. 1.]
[0. 1.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 1.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[1. 0.]
[0. 1.]
[1. 0.]
[0. 1.]
[0. 1.]
[0. 1.]], shape=(76, 2), dtype=float32)
edge_dst: tf.Tensor(
[ 1 0 2 1 3 6 2 4 3 5 9 4 6 8 2 5 7 6 5 4 10 13 9 11
10 12 11 13 17 9 12 14 13 15 14 16 15 17 12 16 18 17 20 23 19 21 22 20
20 19 25 29 24 26 25 27 36 26 28 27 29 24 28 30 29 31 30 32 35 31 33 34
32 32 31 26], shape=(76,), dtype=int32)
edge_src: tf.Tensor(
[ 0 1 1 2 2 2 3 3 4 4 4 5 5 5 6 6 6 7 8 9 9 9 10 10
11 11 12 12 12 13 13 13 14 14 15 15 16 16 17 17 17 18 19 19 20 20 20 21
22 23 24 24 25 25 26 26 26 27 27 28 28 29 29 29 30 30 31 31 31 32 32 32
33 34 35 36], shape=(76,), dtype=int32)
Extract the second subgraph (glycine)
[35]:
from molgraph.chemistry import vis
index = 1
# visualize the second molecule of the GraphTensor as reference
vis.visualize_molecule(
molecule=smiles[index],
atom_index=True,
bond_index=True
)
[35]:
[36]:
print(graph_tensor[index], end='\n\n')
print('node_feature:', graph_tensor[index].node_feature, end='\n\n')
print('edge_feature:', graph_tensor[index].edge_feature, end='\n\n')
print('edge_dst:', graph_tensor[index].edge_dst, end='\n\n')
print('edge_src:', graph_tensor[index].edge_src, end='\n\n')
GraphTensor(
sizes=<tf.Tensor: shape=(), dtype=int64>,
node_feature=<tf.Tensor: shape=(5, 3), dtype=float32>,
edge_src=<tf.Tensor: shape=(8,), dtype=int32>,
edge_dst=<tf.Tensor: shape=(8,), dtype=int32>,
edge_feature=<tf.Tensor: shape=(8, 2), dtype=float32>,
node_position=<tf.Tensor: shape=(5, 16), dtype=float32>)
node_feature: tf.Tensor(
[[1. 0. 0.]
[1. 0. 0.]
[0. 0. 1.]
[0. 0. 1.]
[0. 1. 0.]], shape=(5, 3), dtype=float32)
edge_feature: tf.Tensor(
[[0. 1.]
[0. 1.]
[0. 1.]
[1. 0.]
[0. 1.]
[1. 0.]
[0. 1.]
[0. 1.]], shape=(8, 2), dtype=float32)
edge_dst: tf.Tensor([1 4 0 2 3 1 1 0], shape=(8,), dtype=int32)
edge_src: tf.Tensor([0 0 1 1 1 2 3 4], shape=(8,), dtype=int32)
Separate GraphTensor (its subgraphs) into separate rows
[37]:
graph_tensor = graph_tensor.separate()
print(graph_tensor, end='\n\n')
print('node_feature:', graph_tensor.node_feature, end='\n\n')
print('edge_feature:', graph_tensor.edge_feature, end='\n\n')
print('edge_dst:', graph_tensor.edge_dst, end='\n\n')
print('edge_src:', graph_tensor.edge_src, end='\n\n')
print('graph_indicator:', graph_tensor.graph_indicator)
GraphTensor(
sizes=<tf.Tensor: shape=(3,), dtype=int64>,
node_feature=<tf.RaggedTensor: shape=(3, None, 3), dtype=float32, ragged_rank=1>,
edge_src=<tf.RaggedTensor: shape=(3, None), dtype=int32, ragged_rank=1>,
edge_dst=<tf.RaggedTensor: shape=(3, None), dtype=int32, ragged_rank=1>,
edge_feature=<tf.RaggedTensor: shape=(3, None, 2), dtype=float32, ragged_rank=1>,
node_position=<tf.RaggedTensor: shape=(3, None, 16), dtype=float32, ragged_rank=1>)
node_feature: <tf.RaggedTensor [[[0.0, 0.0, 1.0],
[1.0, 0.0, 0.0],
[1.0, 0.0, 0.0],
[0.0, 0.0, 1.0],
[1.0, 0.0, 0.0],
[1.0, 0.0, 0.0],
[1.0, 0.0, 0.0],
[0.0, 0.0, 1.0],
[0.0, 0.0, 1.0],
[0.0, 1.0, 0.0],
[1.0, 0.0, 0.0],
[0.0, 1.0, 0.0],
[1.0, 0.0, 0.0],
[1.0, 0.0, 0.0],
[0.0, 1.0, 0.0],
[1.0, 0.0, 0.0],
[0.0, 1.0, 0.0],
[1.0, 0.0, 0.0],
[0.0, 1.0, 0.0]], [[1.0, 0.0, 0.0],
[1.0, 0.0, 0.0],
[0.0, 0.0, 1.0],
[0.0, 0.0, 1.0],
[0.0, 1.0, 0.0]], [[1.0, 0.0, 0.0],
[1.0, 0.0, 0.0],
[1.0, 0.0, 0.0],
[1.0, 0.0, 0.0],
[1.0, 0.0, 0.0],
[1.0, 0.0, 0.0],
[1.0, 0.0, 0.0],
[1.0, 0.0, 0.0],
[1.0, 0.0, 0.0],
[0.0, 0.0, 1.0],
[0.0, 0.0, 1.0],
[0.0, 1.0, 0.0],
[0.0, 0.0, 1.0]]]>
edge_feature: <tf.RaggedTensor [[[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 1.0],
[0.0, 1.0]], [[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[1.0, 0.0],
[0.0, 1.0],
[1.0, 0.0],
[0.0, 1.0],
[0.0, 1.0]], [[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 1.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 0.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0],
[1.0, 0.0],
[0.0, 1.0],
[1.0, 0.0],
[0.0, 1.0],
[0.0, 1.0],
[0.0, 1.0]]]>
edge_dst: <tf.RaggedTensor [[1, 0, 2, 1, 3, 6, 2, 4, 3, 5, 9, 4, 6, 8, 2, 5, 7, 6, 5, 4, 10, 13, 9, 11,
10, 12, 11, 13, 17, 9, 12, 14, 13, 15, 14, 16, 15, 17, 12, 16, 18, 17] ,
[1, 4, 0, 2, 3, 1, 1, 0],
[1, 5, 0, 2, 1, 3, 12, 2, 4, 3, 5, 0, 4, 6, 5, 7, 6, 8, 11, 7, 9, 10, 8, 8,
7, 2] ]>
edge_src: <tf.RaggedTensor [[0, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 8, 9, 9, 9, 10, 10,
11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 17, 18] ,
[0, 0, 1, 1, 1, 2, 3, 4],
[0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 7, 7, 7, 8, 8, 8, 9, 10,
11, 12] ]>
graph_indicator: None
Merge GraphTensor (its subgraphs) into a single disjoint graph
[38]:
graph_tensor = graph_tensor.merge()
print(graph_tensor, end='\n\n')
print('node_feature:', graph_tensor.node_feature, end='\n\n')
print('edge_feature:', graph_tensor.edge_feature, end='\n\n')
print('edge_dst:', graph_tensor.edge_dst, end='\n\n')
print('edge_src:', graph_tensor.edge_src, end='\n\n')
GraphTensor(
sizes=<tf.Tensor: shape=(3,), dtype=int64>,
node_feature=<tf.Tensor: shape=(37, 3), dtype=float32>,
edge_src=<tf.Tensor: shape=(76,), dtype=int32>,
edge_dst=<tf.Tensor: shape=(76,), dtype=int32>,
edge_feature=<tf.Tensor: shape=(76, 2), dtype=float32>,
node_position=<tf.Tensor: shape=(37, 16), dtype=float32>)
node_feature: tf.Tensor(
[[0. 0. 1.]
[1. 0. 0.]
[1. 0. 0.]
[0. 0. 1.]
[1. 0. 0.]
[1. 0. 0.]
[1. 0. 0.]
[0. 0. 1.]
[0. 0. 1.]
[0. 1. 0.]
[1. 0. 0.]
[0. 1. 0.]
[1. 0. 0.]
[1. 0. 0.]
[0. 1. 0.]
[1. 0. 0.]
[0. 1. 0.]
[1. 0. 0.]
[0. 1. 0.]
[1. 0. 0.]
[1. 0. 0.]
[0. 0. 1.]
[0. 0. 1.]
[0. 1. 0.]
[1. 0. 0.]
[1. 0. 0.]
[1. 0. 0.]
[1. 0. 0.]
[1. 0. 0.]
[1. 0. 0.]
[1. 0. 0.]
[1. 0. 0.]
[1. 0. 0.]
[0. 0. 1.]
[0. 0. 1.]
[0. 1. 0.]
[0. 0. 1.]], shape=(37, 3), dtype=float32)
edge_feature: tf.Tensor(
[[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[1. 0.]
[0. 1.]
[1. 0.]
[0. 1.]
[0. 1.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 1.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[1. 0.]
[0. 1.]
[1. 0.]
[0. 1.]
[0. 1.]
[0. 1.]], shape=(76, 2), dtype=float32)
edge_dst: tf.Tensor(
[ 1 0 2 1 3 6 2 4 3 5 9 4 6 8 2 5 7 6 5 4 10 13 9 11
10 12 11 13 17 9 12 14 13 15 14 16 15 17 12 16 18 17 20 23 19 21 22 20
20 19 25 29 24 26 25 27 36 26 28 27 29 24 28 30 29 31 30 32 35 31 33 34
32 32 31 26], shape=(76,), dtype=int32)
edge_src: tf.Tensor(
[ 0 1 1 2 2 2 3 3 4 4 4 5 5 5 6 6 6 7 8 9 9 9 10 10
11 11 12 12 12 13 13 13 14 14 15 15 16 16 17 17 17 18 19 19 20 20 20 21
22 23 24 24 25 25 26 26 26 27 27 28 28 29 29 29 30 30 31 31 31 32 32 32
33 34 35 36], shape=(76,), dtype=int32)
[ ]: