Dataset Example

[1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
[2]:
from mddb_workflow.core.dataset import Dataset

dataset_dir = '/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/'
# YAML use for the configuration of the dataset and the automatic inputs.yaml generation.
# project_directories outside the dataset directory are not allowed
dataset_yaml_path = dataset_dir + "dataset.yaml"
print(dataset_yaml_path)
!cat {dataset_yaml_path}
/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/dataset.yaml
global:
  project_directories:
     - '[0]*/**/'  # (matches dirs starting with '0' and all their subfolders; use '[0-9]*/**/' for any leading digit)
[3]:
# Build the Dataset object from the dataset YAML configuration file.
dt = Dataset(dataset_yaml_path)
# Show the first five project directories to verify they are correct
dt.project_directories[:5]
[3]:
['/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/0_to_reimage/6jzh/',
 '/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/0_to_reimage/7dtd/',
 '/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/0_to_reimage/6oik/',
 '/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/0_to_reimage/6j8h/',
 '/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/0_to_reimage/7cmu/']
[5]:
inputs_template = dataset_dir + "inputs_template.yaml"
print(inputs_template)
!cat {inputs_template}
/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/inputs_template.yaml
name: "{{ title }} ({{ DIR }})"
description: "{{ title }} (1 ms)"
authors: Agustín García
groups: IRB Barcelona, Orozco lab
citation: null
thanks: null
contact: agustin.garcia@irbbarcelona.org
type: trajectory
program: GROMACS
version: 2025.2
license: This trajectory dataset is released under a Creative Commons Attribution 4.0 International Public License
linkcense: https://creativecommons.org/licenses/by/4.0/
method: Classical MD
accession: null
links:
- name: Structural data source
  url: https://memprotmd.bioch.ox.ac.uk/_ref/PDB/{{ DIR }}
pdb_ids:
  - {{ DIR }}
forced_references: null
framestep: 0.01
timestep: 2
ensemble: NPT
ff: 53A6 GROMOS
wat: TIP3P
boxtype: Cubic
mds:
  - mdir: replica_1
mdref: 0
interactions: null
pbc_selection: auto
collections: mcns
chainnames: null
membranes: null
customs: null
multimeric: null
trjType: large
bucket: 8d3eha
temp: 310
ligands: null

[6]:
import requests

def obtener_titulo(pdb_id):
    """Fetch the title of a PDB entry from the RCSB Data API.

    Args:
        pdb_id: PDB identifier; it is interpolated into the entry URL as-is.

    Returns:
        The value of ``struct.title`` in the JSON response with surrounding
        whitespace removed, or an empty string when the response carries no
        (or a null) ``struct``/``title``.

    Raises:
        ValueError: If the server answers with a status code other than 200.

    Errors raised by ``requests`` itself (connection failure, timeout) are not
    caught here and propagate to the caller.
    """
    url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
    # Always bound the wait: without a timeout a stalled connection would block
    # the generation of inputs for the whole dataset indefinitely.
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        raise ValueError(f"No se pudo obtener título para PDB {pdb_id}")
    data = r.json()
    # `.get(key, default)` only applies the default when the key is missing;
    # `or` also covers keys that are present but null in the JSON payload.
    struct = data.get('struct') or {}
    title = struct.get('title') or ''
    return title.strip()

# Generate the inputs YAML from the template, handing obtener_titulo to the
# dataset as a callback (presumably used to fill the template's {{ title }}
# placeholder for each project — confirm against the Dataset documentation).
dt.generate_inputs_yaml(inputs_template, obtener_titulo)
[12]:
job_template=dataset_dir + "job_template.sh"
print(job_template)
!cat {job_template}
/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/job_template.sh
#!/bin/bash
#SBATCH --job-name={{ pdbIds[0] }}
#SBATCH --output={{ pdbIds[0] }}_%j.out
#SBATCH --error={{ pdbIds[0] }}_%j.err
#SBATCH --mem=64G
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mail-type=END,FAIL
#SBATCH --time=24:00:00
#SBATCH --mail-user=ruben.chaves@irbbarcelona.org

module load anaconda3
conda activate mwf_env
mwf run -top topology.tpr -md replica_1 trajectory.xtc -i membs -ow -nc {% if group == 3 %}-m intrajrity{% endif %}

[4]:
# Display the per-project status table: state, message, group, and the log and
# error files of each project.
dt.display_status_with_links()
state message group log_file err_file
rel_path
0_to_reimage/6gdg not_run No output log available 3
0_to_reimage/6j8h not_run No output log available 3
0_to_reimage/6jzh not_run No output log available 3
0_to_reimage/6k42 not_run No output log available 3
0_to_reimage/6kux not_run No output log available 3
0_to_reimage/6kuy not_run No output log available 3
0_to_reimage/6ni3 not_run No output log available 3
0_to_reimage/6nt3 not_run No output log available 3
0_to_reimage/6oik not_run No output log available 3
0_to_reimage/6ps5 not_run No output log available 3
0_to_reimage/6qfa not_run No output log available 3
0_to_reimage/6wjc not_run No output log available 3
0_to_reimage/7bz2 not_run No output log available 3
0_to_reimage/7cmu not_run No output log available 3
0_to_reimage/7dhr not_run No output log available 3
0_to_reimage/7dtd not_run No output log available 3
0_to_reimage/7e2y not_run No output log available 3
1_to_run/2lm2 error InputError: Missing input topology file "topology.tpr" 2 logs/mwf_7380854.out logs/mwf_7380854.err
1_to_run/3sn6 error InputError: Missing input topology file "topology.tpr" 2 logs/mwf_7380850.out logs/mwf_7380850.err
1_to_run/6i53 error -> Counting number of frames 1 logs/mwf_7380857.out logs/mwf_7380857.err
1_to_run/6me2 error -> Counting number of frames 1 logs/mwf_7380858.out logs/mwf_7380858.err
1_to_run/6me3 error -> Counting number of frames 1 logs/mwf_7380852.out logs/mwf_7380852.err
1_to_run/6me5 error -> Counting number of frames 1 logs/mwf_7380855.out logs/mwf_7380855.err
1_to_run/6ps7 error -> Counting number of frames 1 logs/mwf_7380853.out logs/mwf_7380853.err
1_to_run/6ps8 error -> Counting number of frames 1 logs/mwf_7380856.out logs/mwf_7380856.err
1_to_run/7e2z error InputError: Missing input topology file "topology.tpr" 2 logs/mwf_7380851.out logs/mwf_7380851.err
2_to_membs/6d6u error TestFailure: Failed to find stable bonds 5 logs/mwf_7451102.out logs/mwf_7451102.err
2_to_membs/6gt3 done Done! 0 logs/mwf_7451113.out logs/mwf_7451113.err
2_to_membs/6j8e error Running BioBB LiPyphilic ZPositions 4 logs/mwf_7451112.out logs/mwf_7451112.err
2_to_membs/6kr8 done Done! 0 logs/mwf_7451105.out logs/mwf_7451105.err
2_to_membs/6n4q done Done! 0 logs/mwf_7451098.out logs/mwf_7451098.err
2_to_membs/6n4r error Running BioBB LiPyphilic ZPositions 4 logs/mwf_7451104.out logs/mwf_7451104.err
2_to_membs/6nt4 done Done! 0 logs/mwf_7451108.out logs/mwf_7451108.err
2_to_membs/6oij error Running BioBB LiPyphilic ZPositions 4 logs/mwf_7451115.out logs/mwf_7451115.err
2_to_membs/6ps2 done Done! 0 logs/mwf_7451096.out logs/mwf_7451096.err
2_to_membs/6vxo done Done! 0 logs/mwf_7451116.out logs/mwf_7451116.err
2_to_membs/6w6o done Done! 0 logs/mwf_7451103.out logs/mwf_7451103.err
2_to_membs/6wgt done Done! 0 logs/mwf_7451097.out logs/mwf_7451097.err
2_to_membs/6zdv done Done! 0 logs/mwf_7451107.out logs/mwf_7451107.err
2_to_membs/7dfp done Done! 0 logs/mwf_7451099.out logs/mwf_7451099.err
2_to_membs/7dhi error Running BioBB LiPyphilic ZPositions 4 logs/mwf_7451114.out logs/mwf_7451114.err
2_to_membs/7dtc error Running BioBB LiPyphilic ZPositions 4 logs/mwf_7451111.out logs/mwf_7451111.err
2_to_membs/7eoq error Running BioBB LiPyphilic ZPositions 4 logs/mwf_7451110.out logs/mwf_7451110.err
2_to_membs/7eor done Done! 0 logs/mwf_7451117.out logs/mwf_7451117.err
2_to_membs/7eos error Running BioBB LiPyphilic ZPositions 4 logs/mwf_7451101.out logs/mwf_7451101.err
2_to_membs/7eot done Done! 0 logs/mwf_7451109.out logs/mwf_7451109.err
2_to_membs/7eou error Running BioBB LiPyphilic ZPositions 4 logs/mwf_7451095.out logs/mwf_7451095.err
2_to_membs/7jvr error TestFailure: RMSD check has failed: there may be sudden jumps along the trajectory 6 logs/mwf_7451106.out logs/mwf_7451106.err
2_to_membs/7k48 done Done! 0 logs/mwf_7451119.out logs/mwf_7451119.err
2_to_membs/7mix error Running BioBB LiPyphilic ZPositions 4 logs/mwf_7451100.out logs/mwf_7451100.err
2_to_membs/7miy error Running BioBB LiPyphilic ZPositions 4 logs/mwf_7451118.out logs/mwf_7451118.err
[5]:
# Summarise the projects by status group: one row per group with its message
# and the number of projects in it.
dt.show_groups()
[5]:
message count
group
0 Done! 13
1 -> Counting number of frames 6
2 InputError: Missing input topology file "topol... 3
3 No output log available 17
4 Running BioBB LiPyphilic ZPositions 10
5 TestFailure: Failed to find stable bonds 1
6 TestFailure: RMSD check has failed: there may ... 1
[6]:
# Query the dataset status from the command line: prints every group with its
# message and the list of projects that belong to it.
!mwf dataset status {dataset_yaml_path}
Group 0:
Message: Done!
Projects:
  - 2_to_membs/6gt3
  - 2_to_membs/6kr8
  - 2_to_membs/6n4q
  - 2_to_membs/6nt4
  - 2_to_membs/6ps2
  - 2_to_membs/6vxo
  - 2_to_membs/6w6o
  - 2_to_membs/6wgt
  - 2_to_membs/6zdv
  - 2_to_membs/7dfp
  - 2_to_membs/7eor
  - 2_to_membs/7eot
  - 2_to_membs/7k48

Group 1:
Message: -> Counting number of frames
Projects:
  - 1_to_run/6i53
  - 1_to_run/6me2
  - 1_to_run/6me3
  - 1_to_run/6me5
  - 1_to_run/6ps7
  - 1_to_run/6ps8

Group 2:
Message: InputError: Missing input topology file "topology.tpr"
Projects:
  - 1_to_run/2lm2
  - 1_to_run/3sn6
  - 1_to_run/7e2z

Group 3:
Message: No output log available
Projects:
  - 0_to_reimage/6gdg
  - 0_to_reimage/6j8h
  - 0_to_reimage/6jzh
  - 0_to_reimage/6k42
  - 0_to_reimage/6kux
  - 0_to_reimage/6kuy
  - 0_to_reimage/6ni3
  - 0_to_reimage/6nt3
  - 0_to_reimage/6oik
  - 0_to_reimage/6ps5
  - 0_to_reimage/6qfa
  - 0_to_reimage/6wjc
  - 0_to_reimage/7bz2
  - 0_to_reimage/7cmu
  - 0_to_reimage/7dhr
  - 0_to_reimage/7dtd
  - 0_to_reimage/7e2y

Group 4:
Message: Running BioBB LiPyphilic ZPositions
Projects:
  - 2_to_membs/6j8e
  - 2_to_membs/6n4r
  - 2_to_membs/6oij
  - 2_to_membs/7dhi
  - 2_to_membs/7dtc
  - 2_to_membs/7eoq
  - 2_to_membs/7eos
  - 2_to_membs/7eou
  - 2_to_membs/7mix
  - 2_to_membs/7miy

Group 5:
Message: TestFailure: Failed to find stable bonds
Projects:
  - 2_to_membs/6d6u

Group 6:
Message: TestFailure: RMSD check has failed: there may be sudden jumps along the trajectory
Projects:
  - 2_to_membs/7jvr

[ ]:
# To launch the workflow with SLURM
dt.launch_workflow(
    include_groups=[3],
    slurm=True,
    job_template=job_template)
# In cmd:
!mwf dataset run {dataset_yaml_path} --slurm -jt {sbatch_template_path} -ig 3