Submit a Dataset
In this tutorial you'll learn how to submit a dataset to a QCFractal instance (also called backend or server).
import os
import dotenv
import random
from tqdm.auto import tqdm
import datamol as dm
import pandas as pd
from openff.toolkit import Molecule
import qcelemental as qcel
from qcportal import PortalClient
from qcportal.record_models import PriorityEnum
from qcportal.singlepoint.dataset_models import SinglepointDatasetNewEntry
from qcportal.singlepoint.record_models import QCSpecification
from qcportal.singlepoint.record_models import SinglepointDriver
_ = dotenv.load_dotenv("../../openfractal_test_secrets.env")
Prepare the dataset¶
First we pull a random subset of `datamol.data.freesolv()`, create `openff.Molecule` objects from it and generate one conformer per molecule.
def get_toy_molecules(
    n_molecules: int = 10,
    progress: bool = True,
    progress_leave: bool = False,
):
    """Return a random FreeSolv subset with one OpenFF molecule per row.

    Args:
        n_molecules: Number of rows sampled from the FreeSolv table.
        progress: Whether to display a progress bar during the conversion.
        progress_leave: Whether the progress bar stays visible once finished.

    Returns:
        The sampled dataframe, re-indexed from 0, with an extra ``mol``
        column holding the OpenFF molecules (one conformer each).
    """

    def _smiles_to_mol(smiles):
        # Build the OpenFF molecule from its SMILES string
        offmol = Molecule.from_smiles(smiles)
        assert offmol is not None

        # Embed a single conformer
        offmol.generate_conformers(n_conformers=1)
        return offmol

    # Draw the random subset of FreeSolv
    subset = dm.data.freesolv().sample(n=n_molecules)

    # Convert every SMILES of the subset in parallel
    subset["mol"] = dm.parallelized(
        _smiles_to_mol,
        subset["smiles"],
        progress=progress,
        tqdm_kwargs={"leave": progress_leave},
    )

    return subset.reset_index(drop=True)
data = get_toy_molecules(n_molecules=10)
data
0%| | 0/10 [00:00<?, ?it/s]
iupac | smiles | expt | calc | mol | |
---|---|---|---|---|---|
0 | pyridine | c1ccncc1 | -4.69 | -3.508 | Molecule with name '' and SMILES '[H][c]1[n][c... |
1 | ethyl propanoate | CCC(=O)OCC | -2.68 | -3.221 | Molecule with name '' and SMILES '[H][C]([H])(... |
2 | 2-methylpyridine | Cc1ccccn1 | -4.63 | -3.501 | Molecule with name '' and SMILES '[H][c]1[n][c... |
3 | 2,3-dichlorodibenzo-p-dioxin | c1ccc2c(c1)Oc3cc(c(cc3O2)Cl)Cl | -3.56 | -3.590 | Molecule with name '' and SMILES '[H][c]1[c]([... |
4 | 1-acetoxyethyl acetate | CC(OC(=O)C)OC(=O)C | -4.97 | -8.006 | Molecule with name '' and SMILES '[H][C]([H])(... |
5 | 2-methylhexane | CCCCC(C)C | 2.93 | 2.894 | Molecule with name '' and SMILES '[H][C]([H])(... |
6 | simazine | CCNc1nc(nc(n1)Cl)NCC | -10.22 | -10.914 | Molecule with name '' and SMILES '[H][N]([c]1[... |
7 | triethylphosphate | CCOP(=O)(OCC)OCC | -7.50 | -10.251 | Molecule with name '' and SMILES '[H][C]([H])(... |
8 | methylcyclopentane | CC1CCCC1 | 1.59 | 1.785 | Molecule with name '' and SMILES '[H][C]([H])(... |
9 | hept-1-yne | CCCCCC#C | 0.60 | 0.639 | Molecule with name '' and SMILES '[H][C]#[C][C... |
Initialize the PortalClient
¶
The client object allows you to interact with any QCFractal instance.
# Connect to the QCFractal test instance. The credentials are read from
# environment variables rather than being written in the notebook
# (presumably populated by the dotenv file loaded at the top — confirm).
client = PortalClient(
address="https://openfractal-test-pgzbs3yryq-uc.a.run.app",
username=os.environ["OPENFRACTAL_USER_1_USERNAME"],
password=os.environ["OPENFRACTAL_USER_1_PASSWORD"],
)
# Display the client summary
client
PortalClient
- Server: openfractal-test
- Address: https://openfractal-test-pgzbs3yryq-uc.a.run.app/
- Username: admin_default
You can display some general information about this instance:
client.server_info
{'name': 'openfractal-test', 'manager_heartbeat_frequency': 10, 'manager_heartbeat_max_missed': 5, 'version': '0.50b12.post16+gee831184', 'api_limits': {'get_records': 1000, 'add_records': 500, 'get_dataset_entries': 2000, 'get_molecules': 1000, 'add_molecules': 1000, 'get_managers': 1000, 'manager_tasks_claim': 200, 'manager_tasks_return': 10, 'get_server_stats': 25, 'get_access_logs': 1000, 'get_error_logs': 100, 'get_internal_jobs': 1000}, 'client_version_lower_limit': '0.50b11', 'client_version_upper_limit': '1', 'manager_version_lower_limit': '0.50b11', 'manager_version_upper_limit': '1', 'motd': ''}
Create a new dataset on the server¶
# Draw ten random digits so that every run gets its own dataset name
dataset_suffix = "".join(random.choices("0123456789", k=10))
dataset_name = "dataset_demo_" + dataset_suffix
dataset_name
'dataset_demo_4321690179'
# All the arguments describing the new dataset, gathered in one place
kwargs = {
    "dataset_type": "singlepoint",
    "name": dataset_name,
    "description": "my great dataset!",
    # the tag allows you to restrict this dataset to only specific managers
    # NOTE(review): manager routing presumably relies on `default_tag`
    # below rather than on `tags` — confirm against the qcportal docs
    "tags": ["demo_tutorial"],
    "group": None,
    "provenance": {},
    "visibility": True,
    "default_tag": "demo_tutorial",
    "default_priority": PriorityEnum.normal,
    "metadata": {},
    "owner_group": None,
}

# Register the dataset on the server and show what was created
ds = client.add_dataset(**kwargs)
ds.dict()
{'id': 5, 'dataset_type': 'singlepoint', 'name': 'dataset_demo_4321690179', 'description': 'my great dataset!', 'tagline': '', 'tags': ['demo_tutorial'], 'group': 'default', 'visibility': True, 'provenance': {}, 'default_tag': 'demo_tutorial', 'default_priority': <PriorityEnum.normal: 1>, 'owner_user': 'admin_default', 'owner_group': None, 'metadata': {}, 'extras': {}, 'entry_names_': [], 'specifications_': {}, 'entries_': {}, 'record_map_': {}, 'contributed_values_': None, 'auto_fetch_missing': True}
Build "entries" from the dataset¶
An entry is a single data point object that holds a 3D atomistic system (also called a molecule). You can associate custom attributes with a given molecule.
chunk_size = 5
progress = True
progress_leave = False
def _create_entry(i, row):
    """Build the dataset entry for one row of the molecule table.

    Args:
        i: Index of the row, used to name the entry ``mol_<i>``.
        row: Row holding the OpenFF molecule under ``"mol"``; every other
            column is stored as an attribute of the entry.

    Returns:
        The ``SinglepointDatasetNewEntry`` describing this molecule.
    """
    return SinglepointDatasetNewEntry(
        name=f"mol_{i}",
        molecule=row["mol"].to_qcschema(),
        additional_keywords={},
        attributes=row.drop("mol").to_dict(),
        comment=None,
    )
# Entries are built and sent chunk by chunk, which keeps each upload
# small when the dataset is large
for start in tqdm(range(0, len(data), chunk_size)):
    # Slice out the rows belonging to this chunk
    chunk = data.iloc[start : start + chunk_size]

    # Turn every (index, row) pair of the chunk into an entry
    entries = dm.parallelized(
        _create_entry,
        chunk.iterrows(),
        arg_type="args",
        total=len(chunk),
        progress=progress,
        tqdm_kwargs={"leave": progress_leave},
    )

    # Upload the chunk and check the insertion was reported as successful
    insert_md = ds.add_entries(entries)
    assert insert_md.success
0%| | 0/2 [00:00<?, ?it/s]
0%| | 0/5 [00:00<?, ?it/s]
0%| | 0/5 [00:00<?, ?it/s]
You can check the newly submitted entries:
list(ds.iterate_entries())
[SinglepointDatasetEntry(name='mol_0', molecule=Molecule(name='C5H5N', formula='C5H5N', hash='a6c6d91'), additional_keywords={}, attributes={'calc': -3.508, 'expt': -4.69, 'iupac': 'pyridine', 'smiles': 'c1ccncc1'}, comment=None, local_results=None), SinglepointDatasetEntry(name='mol_1', molecule=Molecule(name='C5H10O2', formula='C5H10O2', hash='7d0fe6d'), additional_keywords={}, attributes={'calc': -3.221, 'expt': -2.68, 'iupac': 'ethyl propanoate', 'smiles': 'CCC(=O)OCC'}, comment=None, local_results=None), SinglepointDatasetEntry(name='mol_2', molecule=Molecule(name='C6H7N', formula='C6H7N', hash='7739509'), additional_keywords={}, attributes={'calc': -3.501, 'expt': -4.63, 'iupac': '2-methylpyridine', 'smiles': 'Cc1ccccn1'}, comment=None, local_results=None), SinglepointDatasetEntry(name='mol_3', molecule=Molecule(name='C12Cl2H6O2', formula='C12Cl2H6O2', hash='c785c43'), additional_keywords={}, attributes={'calc': -3.59, 'expt': -3.56, 'iupac': '2,3-dichlorodibenzo-p-dioxin', 'smiles': 'c1ccc2c(c1)Oc3cc(c(cc3O2)Cl)Cl'}, comment=None, local_results=None), SinglepointDatasetEntry(name='mol_4', molecule=Molecule(name='C6H10O4', formula='C6H10O4', hash='7de2fa5'), additional_keywords={}, attributes={'calc': -8.006, 'expt': -4.97, 'iupac': '1-acetoxyethyl acetate', 'smiles': 'CC(OC(=O)C)OC(=O)C'}, comment=None, local_results=None), SinglepointDatasetEntry(name='mol_5', molecule=Molecule(name='C7H16', formula='C7H16', hash='31ac45a'), additional_keywords={}, attributes={'calc': 2.894, 'expt': 2.93, 'iupac': '2-methylhexane', 'smiles': 'CCCCC(C)C'}, comment=None, local_results=None), SinglepointDatasetEntry(name='mol_6', molecule=Molecule(name='C7ClH12N5', formula='C7ClH12N5', hash='ab02e52'), additional_keywords={}, attributes={'calc': -10.914, 'expt': -10.22, 'iupac': 'simazine', 'smiles': 'CCNc1nc(nc(n1)Cl)NCC'}, comment=None, local_results=None), SinglepointDatasetEntry(name='mol_7', molecule=Molecule(name='C6H15O4P', formula='C6H15O4P', hash='3f198c7'), 
additional_keywords={}, attributes={'calc': -10.251, 'expt': -7.5, 'iupac': 'triethylphosphate', 'smiles': 'CCOP(=O)(OCC)OCC'}, comment=None, local_results=None), SinglepointDatasetEntry(name='mol_8', molecule=Molecule(name='C6H12', formula='C6H12', hash='bc0c2a1'), additional_keywords={}, attributes={'calc': 1.785, 'expt': 1.59, 'iupac': 'methylcyclopentane', 'smiles': 'CC1CCCC1'}, comment=None, local_results=None), SinglepointDatasetEntry(name='mol_9', molecule=Molecule(name='C7H12', formula='C7H12', hash='f9566b0'), additional_keywords={}, attributes={'calc': 0.639, 'expt': 0.6, 'iupac': 'hept-1-yne', 'smiles': 'CCCCCC#C'}, comment=None, local_results=None)]
Create the QM specification¶
The QM specification defines a QM protocol that will be executed on a Dataset.
Here we choose a cheap level of theory: `hf/sto-3g`.
# PSI4 SCF properties to request
# See https://psicode.org/psi4manual/master/oeprop.html#id2
scf_properties = [
    "MBIS_CHARGES",
    "WIBERG_LOWDIN_INDICES",
    "MAYER_INDICES",
    "LOWDIN_CHARGES",
    "DIPOLE",
    "QUADRUPOLE",
]

# Protocols: the wavefunction setting is one of
# all, none or orbitals_and_eigenvalues
protocols = {"wavefunction": "all"}

# Specification arguments. An alternative level of theory would be
# method "wb97m-d3bj" with basis "def2-tzvppd"; hf/sto-3g is used here.
kwargs = {
    "program": "psi4",
    "driver": SinglepointDriver.gradient,
    "method": "hf",
    "basis": "sto-3g",
    "keywords": {"wcombine": False, "scf_properties": scf_properties},
    "protocols": protocols,
}

specification = QCSpecification(**kwargs)
specification
QCSpecification(program='psi4', driver=<SinglepointDriver.gradient: 'gradient'>, method='hf', basis='sto-3g', keywords={'wcombine': False, 'scf_properties': ['MBIS_CHARGES', 'WIBERG_LOWDIN_INDICES', 'MAYER_INDICES', 'LOWDIN_CHARGES', 'DIPOLE', 'QUADRUPOLE']}, protocols=AtomicResultProtocols(wavefunction=<WavefunctionProtocolEnum.all: 'all'>, stdout=True, error_correction=ErrorCorrectionProtocol(default_policy=True, policies=None), native_files=<NativeFilesProtocolEnum.none: 'none'>))
Now we associate this QM specification (protocol) with the dataset we created above on the server.
# Name under which the specification is attached to the dataset
kwargs = {
    "name": "simple_qm_calculation_demo",
    "specification": specification,
    "description": None,
}

# Attach the specification and check the returned metadata is truthy
insert_md = ds.add_specification(**kwargs)
assert insert_md

# List the entries of the dataset again
list(ds.iterate_entries())
[SinglepointDatasetEntry(name='mol_0', molecule=Molecule(name='C5H5N', formula='C5H5N', hash='a6c6d91'), additional_keywords={}, attributes={'calc': -3.508, 'expt': -4.69, 'iupac': 'pyridine', 'smiles': 'c1ccncc1'}, comment=None, local_results=None), SinglepointDatasetEntry(name='mol_1', molecule=Molecule(name='C5H10O2', formula='C5H10O2', hash='7d0fe6d'), additional_keywords={}, attributes={'calc': -3.221, 'expt': -2.68, 'iupac': 'ethyl propanoate', 'smiles': 'CCC(=O)OCC'}, comment=None, local_results=None), SinglepointDatasetEntry(name='mol_2', molecule=Molecule(name='C6H7N', formula='C6H7N', hash='7739509'), additional_keywords={}, attributes={'calc': -3.501, 'expt': -4.63, 'iupac': '2-methylpyridine', 'smiles': 'Cc1ccccn1'}, comment=None, local_results=None), SinglepointDatasetEntry(name='mol_3', molecule=Molecule(name='C12Cl2H6O2', formula='C12Cl2H6O2', hash='c785c43'), additional_keywords={}, attributes={'calc': -3.59, 'expt': -3.56, 'iupac': '2,3-dichlorodibenzo-p-dioxin', 'smiles': 'c1ccc2c(c1)Oc3cc(c(cc3O2)Cl)Cl'}, comment=None, local_results=None), SinglepointDatasetEntry(name='mol_4', molecule=Molecule(name='C6H10O4', formula='C6H10O4', hash='7de2fa5'), additional_keywords={}, attributes={'calc': -8.006, 'expt': -4.97, 'iupac': '1-acetoxyethyl acetate', 'smiles': 'CC(OC(=O)C)OC(=O)C'}, comment=None, local_results=None), SinglepointDatasetEntry(name='mol_5', molecule=Molecule(name='C7H16', formula='C7H16', hash='31ac45a'), additional_keywords={}, attributes={'calc': 2.894, 'expt': 2.93, 'iupac': '2-methylhexane', 'smiles': 'CCCCC(C)C'}, comment=None, local_results=None), SinglepointDatasetEntry(name='mol_6', molecule=Molecule(name='C7ClH12N5', formula='C7ClH12N5', hash='ab02e52'), additional_keywords={}, attributes={'calc': -10.914, 'expt': -10.22, 'iupac': 'simazine', 'smiles': 'CCNc1nc(nc(n1)Cl)NCC'}, comment=None, local_results=None), SinglepointDatasetEntry(name='mol_7', molecule=Molecule(name='C6H15O4P', formula='C6H15O4P', hash='3f198c7'), 
additional_keywords={}, attributes={'calc': -10.251, 'expt': -7.5, 'iupac': 'triethylphosphate', 'smiles': 'CCOP(=O)(OCC)OCC'}, comment=None, local_results=None), SinglepointDatasetEntry(name='mol_8', molecule=Molecule(name='C6H12', formula='C6H12', hash='bc0c2a1'), additional_keywords={}, attributes={'calc': 1.785, 'expt': 1.59, 'iupac': 'methylcyclopentane', 'smiles': 'CC1CCCC1'}, comment=None, local_results=None), SinglepointDatasetEntry(name='mol_9', molecule=Molecule(name='C7H12', formula='C7H12', hash='f9566b0'), additional_keywords={}, attributes={'calc': 0.639, 'expt': 0.6, 'iupac': 'hept-1-yne', 'smiles': 'CCCCCC#C'}, comment=None, local_results=None)]
Submit the computation¶
Warning: once you have submitted a specification to a dataset, the compatible managers will start picking up jobs and performing the QM calculations.
ds.submit()
Check the submission worked.
print(ds.status_table())
specification complete running -------------------------- ---------- --------- simple_qm_calculation_demo 4 6
Monitoring¶
Retrieve a dataset by its name.
ds = client.get_dataset("singlepoint", dataset_name)
ds
SinglepointDataset(id=5, dataset_type='singlepoint', name='dataset_demo_4321690179', description='my great dataset!', tagline='', tags=['demo_tutorial'], group='default', visibility=True, provenance={}, default_tag='demo_tutorial', default_priority=<PriorityEnum.normal: 1>, owner_user='admin_default', owner_group=None, metadata={}, extras={}, entry_names_=[], specifications_={}, entries_={}, record_map_={}, contributed_values_=None, auto_fetch_missing=True)
Print a table showing the status for a dataset.
print(ds.status_table())
specification complete running -------------------------- ---------- --------- simple_qm_calculation_demo 2 8
Read the records (some might be completed, but some might still be in progress or in a failed state).
# Collect every record of the dataset as a plain dictionary
records_list = []
for record in tqdm(client.query_records(dataset_id=ds.id)):
    # Touch the attribute so that potential errors are fetched, if any,
    # before the record is converted
    record.error
    records_list.append(record.dict())

# One row per record, ordered by record id with a fresh 0..n-1 index
records = pd.DataFrame(records_list).sort_values("id").reset_index(drop=True)
records
0it [00:00, ?it/s]
id | record_type | is_service | properties | extras | status | manager_name | created_on | modified_on | owner_user | owner_group | compute_history_ | task_ | service_ | comments_ | native_files_ | specification | molecule_id | molecule_ | wavefunction_ | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 15 | singlepoint | False | {'pe energy': 0.0, 'scf dipole': [-0.000133218... | {} | RecordStatusEnum.complete | manager_hadrien_local_1-gollum-ef38f2ac-b99a-4... | 2023-06-23 00:01:50.894704 | 2023-06-23 00:03:46.557663 | admin_default | None | [{'id': 15, 'record_id': 15, 'status': 'Record... | None | None | None | None | {'program': 'psi4', 'driver': 'SinglepointDriv... | 24 | None | None |
1 | 52 | singlepoint | False | {'pe energy': 0.0, 'scf dipole': [0.1806203110... | {} | RecordStatusEnum.complete | manager_hadrien_local_1-gollum-ef38f2ac-b99a-4... | 2023-06-23 00:01:50.894737 | 2023-06-23 00:06:23.031904 | admin_default | None | [{'id': 52, 'record_id': 52, 'status': 'Record... | None | None | None | None | {'program': 'psi4', 'driver': 'SinglepointDriv... | 37 | None | None |
2 | 269 | singlepoint | False | None | None | RecordStatusEnum.running | manager_hadrien_local_1-boromir-eb6f7b1c-7db1-... | 2023-06-23 13:09:57.599294 | 2023-06-23 13:10:14.774596 | admin_default | None | [] | None | None | None | None | {'program': 'psi4', 'driver': 'SinglepointDriv... | 131 | None | None |
3 | 270 | singlepoint | False | None | None | RecordStatusEnum.running | manager_hadrien_local_1-boromir-eb6f7b1c-7db1-... | 2023-06-23 13:09:57.599299 | 2023-06-23 13:10:14.774608 | admin_default | None | [] | None | None | None | None | {'program': 'psi4', 'driver': 'SinglepointDriv... | 127 | None | None |
4 | 271 | singlepoint | False | None | None | RecordStatusEnum.running | manager_hadrien_local_1-boromir-eb6f7b1c-7db1-... | 2023-06-23 13:09:57.599300 | 2023-06-23 13:10:14.774614 | admin_default | None | [] | None | None | None | None | {'program': 'psi4', 'driver': 'SinglepointDriv... | 130 | None | None |
5 | 272 | singlepoint | False | None | None | RecordStatusEnum.running | manager_hadrien_local_1-boromir-eb6f7b1c-7db1-... | 2023-06-23 13:09:57.599301 | 2023-06-23 13:10:14.774620 | admin_default | None | [] | None | None | None | None | {'program': 'psi4', 'driver': 'SinglepointDriv... | 133 | None | None |
6 | 273 | singlepoint | False | None | None | RecordStatusEnum.running | manager_hadrien_local_1-boromir-eb6f7b1c-7db1-... | 2023-06-23 13:09:57.599302 | 2023-06-23 13:10:14.774626 | admin_default | None | [] | None | None | None | None | {'program': 'psi4', 'driver': 'SinglepointDriv... | 129 | None | None |
7 | 274 | singlepoint | False | None | None | RecordStatusEnum.running | manager_hadrien_local_1-boromir-eb6f7b1c-7db1-... | 2023-06-23 13:09:57.599303 | 2023-06-23 13:10:14.774632 | admin_default | None | [] | None | None | None | None | {'program': 'psi4', 'driver': 'SinglepointDriv... | 126 | None | None |
8 | 275 | singlepoint | False | None | None | RecordStatusEnum.running | manager_hadrien_local_1-boromir-eb6f7b1c-7db1-... | 2023-06-23 13:09:57.599303 | 2023-06-23 13:10:14.774638 | admin_default | None | [] | None | None | None | None | {'program': 'psi4', 'driver': 'SinglepointDriv... | 132 | None | None |
9 | 276 | singlepoint | False | None | None | RecordStatusEnum.running | manager_hadrien_local_1-boromir-eb6f7b1c-7db1-... | 2023-06-23 13:09:57.599304 | 2023-06-23 13:10:14.774644 | admin_default | None | [] | None | None | None | None | {'program': 'psi4', 'driver': 'SinglepointDriv... | 128 | None | None |
Delete a dataset and associated records¶
Important: Before deleting the dataset you just created, you should check the other tutorials where you'll learn how to launch a manager that can perform the QM calculations submitted above!
Retrieve the ID of a given dataset.
client.list_datasets()
[{'id': 1, 'dataset_type': 'singlepoint', 'dataset_name': 'dataset_1'}, {'id': 3, 'dataset_type': 'singlepoint', 'dataset_name': 'dataset_2'}]
# Look the dataset up by the name generated earlier in this tutorial.
# The suffix is random, so a name hard-coded from a previous run would
# not match the dataset created by the current run.
dataset_id = client.get_dataset("singlepoint", dataset_name).id
dataset_id
5
Delete the dataset and its associated records.
Warning: this step can't be reversed.
client.delete_dataset(dataset_id, delete_records=True)