import numpy as np
from rdkit import Chem
import re
import cirpy
import requests
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
from rdkit.Chem.Descriptors import ExactMolWt
from pubchempy import Compound, get_compounds
from molmass import Formula
[docs]
def get_classyfire(smiles, if_np=False, timeout=30):
    """
    Retrieve the ClassyFire (or NPClassifier) classification for a SMILES string.

    Args:
        smiles (str): The SMILES string of the molecule to classify.
        if_np (bool, optional): Forwarded to ``create_classyfire_url``; when True the
            NPClassifier endpoint is queried instead of the ClassyFire one. Defaults to False.
        timeout (float, optional): Seconds to wait for the server before giving up.
            Defaults to 30.

    Returns:
        The decoded JSON body of the response if the request succeeds and the body is
        valid JSON, otherwise numpy.nan.

    Raises:
        requests.RequestException: If the request itself fails (connection error, timeout).
    """
    url = create_classyfire_url(smiles, if_np)
    # Without an explicit timeout, requests waits forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        return np.nan
    try:
        return response.json()
    except ValueError:
        # A 2xx answer whose body is not JSON counts as "no classification available".
        return np.nan
[docs]
def everything_to_smiles(input):
    """
    Convert various chemical identifier formats to a SMILES string.

    The input is classified in this order: RDKit Mol, NaN, SMILES, InChIKey, CAS number,
    and finally anything else is treated as a chemical name.

    Args:
        input (str or RDKit Mol): The chemical identifier to be converted. It can be a SMILES string, an RDKit Mol object, an InChIKey, a CAS number, or a chemical name.

    Returns:
        smiles (str): The corresponding SMILES string if the conversion is successful.
        Returns NaN if the input is NaN or if the lookup helpers find no match.
    """
    # Mol objects must be recognised before any string-based check: the SMILES
    # parser and the regex helpers only accept text and raise on a Mol.
    if is_mol(input):
        return Chem.MolToSmiles(input)
    if input != input:  # NaN is the only value that is not equal to itself
        return np.nan
    if is_smiles(input):
        return input
    if is_inchikey(input):
        return inchikey_to_smiles(input)
    if is_cas_number(input):
        smiles = cas_to_smiles(input)
        if smiles != smiles:
            # CAS lookup returned NaN; retry with the string as a name.
            smiles = name_to_smiles(input)
        return smiles
    return name_to_smiles(input)
[docs]
def everything_to_inchikey(input, first_block=True):
    """
    Convert various chemical identifiers to an InChIKey or its first block.

    The input is classified in this order: RDKit Mol, NaN, InChIKey, SMILES, CAS number,
    and finally anything else is treated as a chemical name. An input that already is
    an InChIKey is returned as given (optionally truncated) without any lookup.

    Args:
        input (str or RDKit Mol): The chemical identifier to be converted. It can be an InChIKey, a molecule object, a SMILES string, a CAS number, or a chemical name.
        first_block (bool, optional): If True, returns only the first block (first 14 characters) of the InChIKey. Defaults to True.

    Returns:
        inchikey (str): The InChIKey or its first block if `first_block` is True. Returns NaN if the
        input is NaN or cannot be converted.
    """
    # Mol objects must be recognised before the regex-based InChIKey check,
    # which only accepts text and raises TypeError on a Mol.
    if is_mol(input):
        smiles = Chem.MolToSmiles(input)
    elif input != input:  # NaN is the only value that is not equal to itself
        return np.nan
    elif is_inchikey(input):
        return input[0:14] if first_block else input
    elif is_smiles(input):
        smiles = input
    elif is_cas_number(input):
        smiles = cas_to_smiles(input)
        if smiles != smiles:
            # CAS lookup returned NaN; retry with the string as a name.
            smiles = name_to_smiles(input)
    else:
        smiles = name_to_smiles(input)

    # Lookups signal failure with NaN; anything that is not text cannot be parsed.
    if not isinstance(smiles, str):
        return np.nan
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # The resolved string was not parseable as SMILES.
        return np.nan
    inchikey = Chem.MolToInchiKey(mol)
    return inchikey[0:14] if first_block else inchikey
[docs]
def create_classyfire_url(smiles_string, if_np=True):
    """
    Build the query URL for NPClassifier or ClassyFire for a SMILES string. Helper function.

    Args:
        smiles_string (str): The SMILES string to classify. It is percent-encoded before
            being placed in the query string.
        if_np (bool, optional): If True, build an NPClassifier URL; otherwise build a
            ClassyFire URL. Defaults to True (note that ``get_classyfire`` passes its own
            default of False).

    Returns:
        str: The complete request URL.
    """
    from urllib.parse import quote

    if if_np:
        url_template = "https://npclassifier.gnps2.org/classify?smiles={}"
    else:
        url_template = 'https://structure.gnps2.org/classyfire?smiles={}'
    # SMILES routinely contain characters that are special inside a URL: '#'
    # (triple bond) would start a fragment, '+' would decode as a space and
    # '%' (two-digit ring closures) would be read as an escape sequence.
    encoded_smiles = quote(str(smiles_string), safe="")
    return url_template.format(encoded_smiles)
[docs]
def smiles_to_inchikey(smiles):
    """
    Convert a SMILES string to the first block of its InChIKey. Helper function.

    Args:
        smiles (str): A SMILES string representing a molecule. A float (e.g. NaN)
            is treated as a missing value.

    Returns:
        inchikey (str): The first block (first 14 characters) of the molecule's InChIKey,
        or NaN if the input is a float or cannot be parsed as SMILES.
    """
    if isinstance(smiles, float):
        return np.nan
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparseable SMILES: report a missing value instead of failing
        # inside the InChIKey generator.
        return np.nan
    return Chem.MolToInchiKey(mol)[0:14]
[docs]
def inchikey_to_smiles(inchikey):
    """
    Look up the isomeric SMILES for an InChIKey through PubChem. Helper function.

    The full key is queried first; if that yields no compound, the query is
    repeated with only the first 14 characters of the key.

    Args:
        inchikey (str): The InChIKey of the molecule to look up.

    Returns:
        str: The isomeric SMILES of the first matching compound, or NaN if
        neither query returns a compound.
    """
    hits = get_compounds(inchikey, 'inchikey')
    if len(hits) == 0:
        # Nothing for the full key: fall back to the first block only.
        hits = get_compounds(inchikey[0:14], 'inchikey')
    if len(hits) == 0:
        return np.nan
    return hits[0].isomeric_smiles
[docs]
def cas_to_smiles(cas):
    """
    Resolve a CAS (Chemical Abstracts Service) number to a SMILES string using cirpy.

    Args:
        cas (str): The CAS number of the chemical compound.

    Returns:
        str: The SMILES string reported by the resolver, or NaN when the
        resolver returns None.
    """
    resolved = cirpy.resolve(cas, 'smiles')
    # The resolver reports "no result" as None; this module uses NaN for that.
    return np.nan if resolved is None else resolved
[docs]
def name_to_smiles(name):
    """
    Look up the isomeric SMILES for a chemical name, with PubChem as backend.

    Args:
        name (str): The chemical name to be converted.

    Returns:
        str: The isomeric SMILES of the first matching compound, or numpy.nan
        if the search returns no compound.
    """
    matches = get_compounds(name, 'name')
    if len(matches) == 0:
        return np.nan
    # Only the first hit is used.
    best_match = matches[0]
    return best_match.isomeric_smiles
[docs]
def everything_to_image(molecule, savepath):
    """
    Render a molecule as an image and write it to ``savepath``.

    Args:
        molecule (str or RDKit Mol object): The molecule to draw. An RDKit Mol is used
            directly, a SMILES string is parsed, and any other identifier is first
            converted with ``everything_to_smiles``.
        savepath (str): The file path where the generated image will be saved.

    Returns:
        None
    """
    from rdkit import Chem
    from rdkit.Chem import Draw

    if is_mol(molecule):
        # Already a molecule object; nothing to parse.
        rd_mol = molecule
    else:
        if is_smiles(molecule):
            smiles_text = molecule
        else:
            # Not SMILES: resolve the identifier to SMILES first.
            smiles_text = everything_to_smiles(molecule)
        rd_mol = Chem.MolFromSmiles(smiles_text)

    # Draw the molecule and write the picture to disk.
    picture = Draw.MolToImage(rd_mol)
    picture.save(savepath)
#below are is_ section
[docs]
def is_inchikey(string):
    """
    Check if a given value is a string in InChIKey format, using a regex.

    An InChIKey is a 27-character string divided into three blocks by hyphens:
    - The first block contains 14 uppercase letters.
    - The second block contains 10 uppercase letters.
    - The third block contains a single uppercase letter or digit.

    Args:
        string (str): The value to be checked. Non-string values are never InChIKeys.

    Returns:
        bool: True if the value is a string in InChIKey format, False otherwise.
    """
    if not isinstance(string, str):
        # The regex engine raises TypeError on non-text input (None, NaN, Mol, ...).
        return False
    pattern = r'^[A-Z]{14}-[A-Z]{10}-[A-Z0-9]$'
    # fullmatch, unlike match + '$', also rejects a trailing newline.
    return re.fullmatch(pattern, string) is not None
[docs]
def is_mol(mol):
    """
    Tell whether an object is an RDKit molecule.

    Args:
        mol: The object to check.

    Returns:
        bool: True if the object is an instance of Chem.rdchem.Mol, False otherwise.
    """
    rdkit_mol_type = Chem.rdchem.Mol
    return isinstance(mol, rdkit_mol_type)
[docs]
def is_smiles(smiles_string):
    """
    Check if a given value is a valid SMILES (Simplified Molecular Input Line Entry System) string.

    Args:
        smiles_string (str): The SMILES string to be validated. Non-string values
            (None, NaN, Mol objects, ...) are never valid SMILES.

    Returns:
        bool: True if the value is a string that RDKit can parse into a molecule,
        False otherwise.

    Example:
        >>> is_smiles("CCO")
        True
        >>> is_smiles("invalid_smiles")
        False
    """
    if not isinstance(smiles_string, str):
        # The parser only accepts text; answer False instead of letting it raise.
        return False
    # The parser returns None when the text is not valid SMILES.
    return Chem.MolFromSmiles(smiles_string) is not None
[docs]
def is_cas_number(string):
    """
    Check if a given value is a string in CAS (Chemical Abstracts Service) number format.

    The accepted format is one or more digits, followed by a hyphen, followed by two
    or more digits, followed by another hyphen, and ending with a single digit.
    Only the format is checked; the trailing check digit is not verified.

    Args:
        string (str): The value to be checked. Non-string values are never CAS numbers.

    Returns:
        bool: True if the value is a string in CAS number format, False otherwise.
    """
    if not isinstance(string, str):
        # The regex engine raises TypeError on non-text input (None, NaN, Mol, ...).
        return False
    # One or more digits, hyphen, two or more digits, hyphen, one digit.
    pattern = r'^\d+-\d{2,}-\d$'
    # fullmatch, unlike match + '$', also rejects a trailing newline.
    return re.fullmatch(pattern, string) is not None
[docs]
def everything_to_mw(mol):
    """
    Convert a given molecule representation to its exact molecular weight.

    If the input is not already an RDKit Mol, it is converted to a SMILES string with
    ``everything_to_smiles`` and parsed into a Mol. The weight is then computed with
    RDKit's ``ExactMolWt``.

    Args:
        mol: The input molecule representation. This can be a molecule object or another representation that can be converted to a SMILES string.

    Returns:
        float: The exact molecular weight of the molecule.

    Raises:
        ValueError: If the input cannot be converted to a valid molecule object.
    """
    if not is_mol(mol):
        identifier = mol
        smiles = everything_to_smiles(identifier)
        if not isinstance(smiles, str):
            # The conversion helpers signal "not found" with NaN.
            raise ValueError(f"Could not convert {identifier!r} to a SMILES string.")
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"Could not parse SMILES {smiles!r} into a molecule.")
    # Always return the weight, never the intermediate Mol object.
    return ExactMolWt(mol)
# pattern = r'^([A-Z][a-z]?\d*)+$'
# if s.endswith('+') or s.endswith('-'):
# s = s[:-1]
# # Match the entire string against the pattern
# match = re.fullmatch(pattern, s)
# # If there's a match, the string is a valid chemical formula
# return bool(match)