Dataset Example
[1]:
# Enable IPython's autoreload extension in mode 2: modules are reloaded
# before each cell is executed, so edits to imported source code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
[2]:
from mddb_workflow.core.dataset import Dataset
dataset_dir = '/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/'
# YAML use for the configuration of the dataset and the automatic inputs.yaml generation.
# project_directories outside the dataset directory are not allowed
dataset_yaml_path = dataset_dir + "dataset.yaml"
print(dataset_yaml_path)
!cat {dataset_yaml_path}
/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/dataset.yaml
global:
project_directories:
- '[0]*/**/' # (matches dirs whose name starts with '0' and all their subfolders)
[3]:
# Build the Dataset object from the YAML configuration file
dt = Dataset(dataset_yaml_path)
# Show the first five project directories to verify they are correct
dt.project_directories[:5]
[3]:
['/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/0_to_reimage/6jzh/',
'/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/0_to_reimage/7dtd/',
'/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/0_to_reimage/6oik/',
'/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/0_to_reimage/6j8h/',
'/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/0_to_reimage/7cmu/']
[5]:
inputs_template = dataset_dir + "inputs_template.yaml"
print(inputs_template)
!cat {inputs_template}
/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/inputs_template.yaml
name: "{{ title }} ({{ DIR }})"
description: "{{ title }} (1 ms)"
authors: Agustín García
groups: IRB Barcelona, Orozco lab
citation: null
thanks: null
contact: agustin.garcia@irbbarcelona.org
type: trajectory
program: GROMACS
version: 2025.2
license: This trajectory dataset is released under a Creative Commons Attribution 4.0 International Public License
linkcense: https://creativecommons.org/licenses/by/4.0/
method: Classical MD
accession: null
links:
- name: Structural data source
url: https://memprotmd.bioch.ox.ac.uk/_ref/PDB/{{ DIR }}
pdb_ids:
- {{ DIR }}
forced_references: null
framestep: 0.01
timestep: 2
ensemble: NPT
ff: 53A6 GROMOS
wat: TIP3P
boxtype: Cubic
mds:
- mdir: replica_1
mdref: 0
interactions: null
pbc_selection: auto
collections: mcns
chainnames: null
membranes: null
customs: null
multimeric: null
trjType: large
bucket: 8d3eha
temp: 310
ligands: null
[6]:
import requests
def obtener_titulo(pdb_id):
    """Fetch the title of a PDB entry from the RCSB data API.

    Args:
        pdb_id: PDB identifier of the entry to look up.

    Returns:
        The entry's ``struct.title`` value with surrounding whitespace
        removed, or an empty string when the response carries no title.

    Raises:
        ValueError: If the API answers with a status code other than 200.
        requests.RequestException: On connection problems or when the
            request times out.
    """
    url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
    # Always pass a timeout: without one, requests may block indefinitely
    # when the server never answers.
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        raise ValueError(f"No se pudo obtener título para PDB {pdb_id}")
    data = r.json()
    # 'struct' or 'title' may be missing or null in the payload; fall back
    # to an empty title instead of failing on None.
    title = (data.get('struct') or {}).get('title') or ''
    return title.strip()


# NOTE(review): presumably the callable provides the `title` field used by
# the inputs template — confirm against Dataset.generate_inputs_yaml.
dt.generate_inputs_yaml(inputs_template, obtener_titulo)
[12]:
job_template=dataset_dir + "job_template.sh"
print(job_template)
!cat {job_template}
/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/job_template.sh
#!/bin/bash
#SBATCH --job-name={{ pdbIds[0] }}
#SBATCH --output={{ pdbIds[0] }}_%j.out
#SBATCH --error={{ pdbIds[0] }}_%j.err
#SBATCH --mem=64G
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mail-type=END,FAIL
#SBATCH --time=24:00:00
#SBATCH --mail-user=ruben.chaves@irbbarcelona.org
module load anaconda3
conda activate mwf_env
mwf run -top topology.tpr -md replica_1 trajectory.xtc -i membs -ow -nc {% if group == 3 %}-m intrajrity{% endif %}
[4]:
# Display the status table of the dataset: one row per project with its
# state, message, group and log/error files.
dt.display_status_with_links()
| state | message | group | log_file | err_file | |
|---|---|---|---|---|---|
| rel_path | |||||
| 0_to_reimage/6gdg | not_run | No output log available | 3 | ||
| 0_to_reimage/6j8h | not_run | No output log available | 3 | ||
| 0_to_reimage/6jzh | not_run | No output log available | 3 | ||
| 0_to_reimage/6k42 | not_run | No output log available | 3 | ||
| 0_to_reimage/6kux | not_run | No output log available | 3 | ||
| 0_to_reimage/6kuy | not_run | No output log available | 3 | ||
| 0_to_reimage/6ni3 | not_run | No output log available | 3 | ||
| 0_to_reimage/6nt3 | not_run | No output log available | 3 | ||
| 0_to_reimage/6oik | not_run | No output log available | 3 | ||
| 0_to_reimage/6ps5 | not_run | No output log available | 3 | ||
| 0_to_reimage/6qfa | not_run | No output log available | 3 | ||
| 0_to_reimage/6wjc | not_run | No output log available | 3 | ||
| 0_to_reimage/7bz2 | not_run | No output log available | 3 | ||
| 0_to_reimage/7cmu | not_run | No output log available | 3 | ||
| 0_to_reimage/7dhr | not_run | No output log available | 3 | ||
| 0_to_reimage/7dtd | not_run | No output log available | 3 | ||
| 0_to_reimage/7e2y | not_run | No output log available | 3 | ||
| 1_to_run/2lm2 | error | InputError: Missing input topology file "topology.tpr" | 2 | logs/mwf_7380854.out | logs/mwf_7380854.err |
| 1_to_run/3sn6 | error | InputError: Missing input topology file "topology.tpr" | 2 | logs/mwf_7380850.out | logs/mwf_7380850.err |
| 1_to_run/6i53 | error | -> Counting number of frames | 1 | logs/mwf_7380857.out | logs/mwf_7380857.err |
| 1_to_run/6me2 | error | -> Counting number of frames | 1 | logs/mwf_7380858.out | logs/mwf_7380858.err |
| 1_to_run/6me3 | error | -> Counting number of frames | 1 | logs/mwf_7380852.out | logs/mwf_7380852.err |
| 1_to_run/6me5 | error | -> Counting number of frames | 1 | logs/mwf_7380855.out | logs/mwf_7380855.err |
| 1_to_run/6ps7 | error | -> Counting number of frames | 1 | logs/mwf_7380853.out | logs/mwf_7380853.err |
| 1_to_run/6ps8 | error | -> Counting number of frames | 1 | logs/mwf_7380856.out | logs/mwf_7380856.err |
| 1_to_run/7e2z | error | InputError: Missing input topology file "topology.tpr" | 2 | logs/mwf_7380851.out | logs/mwf_7380851.err |
| 2_to_membs/6d6u | error | TestFailure: Failed to find stable bonds | 5 | logs/mwf_7451102.out | logs/mwf_7451102.err |
| 2_to_membs/6gt3 | done | Done! | 0 | logs/mwf_7451113.out | logs/mwf_7451113.err |
| 2_to_membs/6j8e | error | Running BioBB LiPyphilic ZPositions | 4 | logs/mwf_7451112.out | logs/mwf_7451112.err |
| 2_to_membs/6kr8 | done | Done! | 0 | logs/mwf_7451105.out | logs/mwf_7451105.err |
| 2_to_membs/6n4q | done | Done! | 0 | logs/mwf_7451098.out | logs/mwf_7451098.err |
| 2_to_membs/6n4r | error | Running BioBB LiPyphilic ZPositions | 4 | logs/mwf_7451104.out | logs/mwf_7451104.err |
| 2_to_membs/6nt4 | done | Done! | 0 | logs/mwf_7451108.out | logs/mwf_7451108.err |
| 2_to_membs/6oij | error | Running BioBB LiPyphilic ZPositions | 4 | logs/mwf_7451115.out | logs/mwf_7451115.err |
| 2_to_membs/6ps2 | done | Done! | 0 | logs/mwf_7451096.out | logs/mwf_7451096.err |
| 2_to_membs/6vxo | done | Done! | 0 | logs/mwf_7451116.out | logs/mwf_7451116.err |
| 2_to_membs/6w6o | done | Done! | 0 | logs/mwf_7451103.out | logs/mwf_7451103.err |
| 2_to_membs/6wgt | done | Done! | 0 | logs/mwf_7451097.out | logs/mwf_7451097.err |
| 2_to_membs/6zdv | done | Done! | 0 | logs/mwf_7451107.out | logs/mwf_7451107.err |
| 2_to_membs/7dfp | done | Done! | 0 | logs/mwf_7451099.out | logs/mwf_7451099.err |
| 2_to_membs/7dhi | error | Running BioBB LiPyphilic ZPositions | 4 | logs/mwf_7451114.out | logs/mwf_7451114.err |
| 2_to_membs/7dtc | error | Running BioBB LiPyphilic ZPositions | 4 | logs/mwf_7451111.out | logs/mwf_7451111.err |
| 2_to_membs/7eoq | error | Running BioBB LiPyphilic ZPositions | 4 | logs/mwf_7451110.out | logs/mwf_7451110.err |
| 2_to_membs/7eor | done | Done! | 0 | logs/mwf_7451117.out | logs/mwf_7451117.err |
| 2_to_membs/7eos | error | Running BioBB LiPyphilic ZPositions | 4 | logs/mwf_7451101.out | logs/mwf_7451101.err |
| 2_to_membs/7eot | done | Done! | 0 | logs/mwf_7451109.out | logs/mwf_7451109.err |
| 2_to_membs/7eou | error | Running BioBB LiPyphilic ZPositions | 4 | logs/mwf_7451095.out | logs/mwf_7451095.err |
| 2_to_membs/7jvr | error | TestFailure: RMSD check has failed: there may be sudden jumps along the trajectory | 6 | logs/mwf_7451106.out | logs/mwf_7451106.err |
| 2_to_membs/7k48 | done | Done! | 0 | logs/mwf_7451119.out | logs/mwf_7451119.err |
| 2_to_membs/7mix | error | Running BioBB LiPyphilic ZPositions | 4 | logs/mwf_7451100.out | logs/mwf_7451100.err |
| 2_to_membs/7miy | error | Running BioBB LiPyphilic ZPositions | 4 | logs/mwf_7451118.out | logs/mwf_7451118.err |
[5]:
# Summarize the groups: the message of each group and its project count
dt.show_groups()
[5]:
| message | count | |
|---|---|---|
| group | ||
| 0 | Done! | 13 |
| 1 | -> Counting number of frames | 6 |
| 2 | InputError: Missing input topology file "topol... | 3 |
| 3 | No output log available | 17 |
| 4 | Running BioBB LiPyphilic ZPositions | 10 |
| 5 | TestFailure: Failed to find stable bonds | 1 |
| 6 | TestFailure: RMSD check has failed: there may ... | 1 |
[6]:
# Command-line status report: lists the projects of each group
!mwf dataset status {dataset_yaml_path}
Group 0:
Message: Done!
Projects:
- 2_to_membs/6gt3
- 2_to_membs/6kr8
- 2_to_membs/6n4q
- 2_to_membs/6nt4
- 2_to_membs/6ps2
- 2_to_membs/6vxo
- 2_to_membs/6w6o
- 2_to_membs/6wgt
- 2_to_membs/6zdv
- 2_to_membs/7dfp
- 2_to_membs/7eor
- 2_to_membs/7eot
- 2_to_membs/7k48
Group 1:
Message: -> Counting number of frames
Projects:
- 1_to_run/6i53
- 1_to_run/6me2
- 1_to_run/6me3
- 1_to_run/6me5
- 1_to_run/6ps7
- 1_to_run/6ps8
Group 2:
Message: InputError: Missing input topology file "topology.tpr"
Projects:
- 1_to_run/2lm2
- 1_to_run/3sn6
- 1_to_run/7e2z
Group 3:
Message: No output log available
Projects:
- 0_to_reimage/6gdg
- 0_to_reimage/6j8h
- 0_to_reimage/6jzh
- 0_to_reimage/6k42
- 0_to_reimage/6kux
- 0_to_reimage/6kuy
- 0_to_reimage/6ni3
- 0_to_reimage/6nt3
- 0_to_reimage/6oik
- 0_to_reimage/6ps5
- 0_to_reimage/6qfa
- 0_to_reimage/6wjc
- 0_to_reimage/7bz2
- 0_to_reimage/7cmu
- 0_to_reimage/7dhr
- 0_to_reimage/7dtd
- 0_to_reimage/7e2y
Group 4:
Message: Running BioBB LiPyphilic ZPositions
Projects:
- 2_to_membs/6j8e
- 2_to_membs/6n4r
- 2_to_membs/6oij
- 2_to_membs/7dhi
- 2_to_membs/7dtc
- 2_to_membs/7eoq
- 2_to_membs/7eos
- 2_to_membs/7eou
- 2_to_membs/7mix
- 2_to_membs/7miy
Group 5:
Message: TestFailure: Failed to find stable bonds
Projects:
- 2_to_membs/6d6u
Group 6:
Message: TestFailure: RMSD check has failed: there may be sudden jumps along the trajectory
Projects:
- 2_to_membs/7jvr
[ ]:
# To launch the workflow with SLURM
dt.launch_workflow(
include_groups=[3],
slurm=True,
job_template=job_template)
# In cmd:
!mwf dataset run {dataset_yaml_path} --slurm -jt {sbatch_template_path} -ig 3