Chroma是Generate Biomedicines开发的一款蛋白设计算法,支持骨架设计和序列设计,与Baker团队开发的RFdiffusion有一定相似之处。

开源代码: https://github.com/generatebio/chroma

论文地址:Illuminating protein space with a programmable generative model

基本功能测试

无条件采样(Unconditional Sampling)

最基础的采样生成,随机生成结构,可以分为单体生成和复合物生成,适合用于随机结构数据集的构建。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import appdirs
import os
from chroma import Chroma
from pathlib import Path

# Unconditional sampling: draw a random structure with no conditioner and
# write it as a PDB file into the current working directory.
out_dir = Path.cwd()
# Tag the output with the SLURM job id when one is set, otherwise with the
# id of this process, so repeated runs do not overwrite each other.
TEST_TASK_ID = os.environ.get('SLURM_JOBID', os.getpid())
print("Test task id: ", TEST_TASK_ID)
local_model_dir = Path(appdirs.user_cache_dir("chroma/weights"))
chroma = Chroma(
    weights_backbone=local_model_dir / "chroma_backbone_v1.0.pt",
    weights_design=local_model_dir / "chroma_design_v1.0.pt",
)
# One entry per chain: a single length samples a monomer, several lengths
# sample a multi-chain complex.
chain_lengths = [100, 200]
protein = chroma.sample(chain_lengths=chain_lengths)
# Name the file after what was actually sampled (the original always wrote
# a misspelled "momomer" label, even for a two-chain complex).
sample_kind = "monomer" if len(chain_lengths) == 1 else "complex"
protein.to(str(out_dir / f"unconditional_{sample_kind}_{TEST_TASK_ID}.pdb"))
无条件单体采样
无条件单体采样
无条件复合物采样
无条件复合物采样

指定二级结构采样(Secondary Structure Sampling)

算法支持指定每个氨基酸残基的二级结构(Helix为H,Sheet为E,Loop为T)。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import appdirs
import os
from chroma import Chroma, conditioners
from chroma.models import graph_classifier
from pathlib import Path

# Secondary-structure-conditioned sampling: steer generation with a
# per-residue secondary-structure string and save the result as PDB.
output_dir = Path.cwd()
# SLURM job id when present, otherwise this process id; used to tag outputs.
task_id = os.environ.get('SLURM_JOBID', os.getpid())
print("Test task id: ", task_id)

weights_dir = Path(appdirs.user_cache_dir("chroma/weights"))
target_device = 'cuda:0'
model = Chroma(
    weights_backbone=weights_dir / "chroma_backbone_v1.0.pt",
    weights_design=weights_dir / "chroma_design_v1.0.pt",
    device=target_device,
)

# One letter per residue: H = helix, E = sheet, T = loop.
ss_string = "EEEEETTTEEEEE"

classifier = graph_classifier.load_model(
    weights_dir / "chroma_proclass_v1.0.pt",
    device=target_device,
)
ss_conditioner = conditioners.ProClassConditioner(
    "secondary_structure",
    ss_string,
    max_norm=None,
    model=classifier,
)
# The chain is exactly as long as the secondary-structure string.
design = model.sample(
    chain_lengths=[len(ss_string)],
    conditioner=ss_conditioner,
)
design.to(str(output_dir / f"ss_design_{task_id}.pdb"))

符号形状采样(Symbol-like Structure Sampling)

这是一种炫技的蛋白结构设计,例如让设计蛋白结构像一个“G”。
但实测效果并没有那么像,论文中应该是选了典型case。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import appdirs
import os
from chroma import Chroma, conditioners
from chroma.utility.chroma import letter_to_point_cloud
from pathlib import Path

# Shape-conditioned sampling: generate a backbone whose overall shape
# follows a single letter glyph, then save it as PDB.
output_dir = Path.cwd()
# SLURM job id when present, otherwise this process id; used to tag outputs.
task_id = os.environ.get('SLURM_JOBID', os.getpid())
print("Test task id: ", task_id)

weights_dir = Path(appdirs.user_cache_dir("chroma/weights"))
target_device = 'cuda:0'
model = Chroma(
    weights_backbone=weights_dir / "chroma_backbone_v1.0.pt",
    weights_design=weights_dir / "chroma_design_v1.0.pt",
    device=target_device,
)

character = "G"  # @param {type:"string"}
# Only one glyph is supported; silently extra characters would be confusing,
# so truncate and tell the user.
if len(character) >= 2:
    character = character[:1]
    print(f"Keeping only first character ({character})!")
length = 1000  # @param {type:"slider", min:100, max:1500, step:100}

# Point cloud for the glyph; the conditioner is built from it together with
# the backbone network's noise schedule.
glyph_points = letter_to_point_cloud(character)
shape_conditioner = conditioners.ShapeConditioner(
    glyph_points,
    model.backbone_network.noise_schedule,
    autoscale_num_residues=length,
)
shape_conditioner = shape_conditioner.to(target_device)

shaped_protein = model.sample(
    conditioner=shape_conditioner,
    chain_lengths=[length],
)

shaped_protein.to(str(output_dir / f"shape_design_{task_id}.pdb"))

CATH采样(CATH-based Structure Sampling)

CATH编号是对蛋白结构(结构域折叠类型)的一种层级分类编码,算法能够根据给定的CATH编号生成对应类型的结构。
这实际上是在前面的二级结构采样基础上,增加了对CATH的查询解析。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import appdirs
import os
from chroma import Chroma, conditioners
from chroma.models import graph_classifier
from pathlib import Path

# CATH-conditioned sampling: steer generation towards a given CATH code
# with the ProClass classifier and save the result as PDB.
run_dir = Path.cwd()
# SLURM job id when present, otherwise this process id; used to tag outputs.
task_id = os.environ.get('SLURM_JOBID', os.getpid())
print("Test task id: ", task_id)

weights_root = Path(appdirs.user_cache_dir("chroma/weights"))
gpu = 'cuda:0'
model = Chroma(
    weights_backbone=weights_root / "chroma_backbone_v1.0.pt",
    weights_design=weights_root / "chroma_design_v1.0.pt",
    device=gpu,
)

CATH = "3.40.50"  # @param {type:"string"}
length = 130  # @param {type:"slider", min:50, max:250, step:10}

cath_classifier = graph_classifier.load_model(
    weights_root / "chroma_proclass_v1.0.pt",
    device=gpu,
)
cath_conditioner = conditioners.ProClassConditioner(
    "cath",
    CATH,
    model=cath_classifier,
)
cath_conditioned_protein = model.sample(
    chain_lengths=[length],
    conditioner=cath_conditioner,
)
cath_conditioned_protein.to(str(run_dir / f"fold_design_{task_id}.pdb"))

提示词采样(Prompt-based Structure Sampling)

内嵌了LLM模型,用来支持基于给定提示词进行结构生成,
但实测效果不咋样,而且这样做的商业意义很有限。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import appdirs
import os
from chroma import Chroma, conditioners
from chroma.models import procap
from pathlib import Path

# Caption-conditioned sampling: steer generation with a free-text caption
# through the ProCap model and save the result as PDB.
run_dir = Path.cwd()
# SLURM job id when present, otherwise this process id; used to tag outputs.
task_id = os.environ.get('SLURM_JOBID', os.getpid())
print("Test task id: ", task_id)

weights_root = Path(appdirs.user_cache_dir("chroma/weights"))
# Locally cached snapshot of the language model handed to ProCap as `lm_id`.
language_model_dir = Path(appdirs.user_cache_dir(
    "huggingface/hub/models--EleutherAI--gpt-neo-125m/snapshots/6cb0d322a3a484e99667e7cb240e22f1ac036b99"))
gpu = 'cuda:0'
model = Chroma(
    weights_backbone=weights_root / "chroma_backbone_v1.0.pt",
    weights_design=weights_root / "chroma_design_v1.0.pt",
    device=gpu,
)

length = 110  # @param {type:"slider", min:50, max:250, step:10}
caption = "Crystal structure of SH2 domain"  # @param {type:"string"}

caption_model = procap.load_model(
    weights_root / "chroma_procap_v1.0.pt",
    device=gpu,
    lm_id=language_model_dir,
)
caption_conditioner = conditioners.ProCapConditioner(
    caption,
    -1,
    model=caption_model,
)
captioned_protein = model.sample(
    conditioner=caption_conditioner,
    chain_lengths=[length],
    steps=200,
)
captioned_protein.to(str(run_dir / f"procap_design_{task_id}.pdb"))

全结构重采样(Full structure resampling)

算法可以做给定结构的重新采样,但采样效果与原始结构不像。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import os
from pathlib import Path

import appdirs
import torch
from chroma import Chroma, Protein

# Full-structure resampling: load an existing structure, use it as the
# initial state for sampling, and save the resampled result as PDB.
out_dir = Path.cwd()
# SLURM job id when present, otherwise this process id; used to tag outputs.
TEST_TASK_ID = os.environ.get('SLURM_JOBID', os.getpid())
print("Test task id: ", TEST_TASK_ID)
local_model_dir = Path(appdirs.user_cache_dir("chroma/weights"))

# Fall back to CPU when no GPU is visible instead of failing on a
# hard-coded "cuda:0", and keep the input structure and the model on the
# same device.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
protein = Protein("FC-III-AAPC.pdb", device=device)
chroma = Chroma(
    weights_backbone=local_model_dir / "chroma_backbone_v1.0.pt",
    weights_design=local_model_dir / "chroma_design_v1.0.pt",
    device=device,
)
# No conditioner: the whole input structure is resampled.
protein = chroma.sample(protein_init=protein)
protein.to(str(out_dir / f"full_resample_{TEST_TASK_ID}.pdb"))
full_design_before.png
Init protein: FCIII & Fc fragment
full_design_after.png
Fully resampled

局部重采样(Partial structure resampling)

局部重新采样是指可以选择给定区域进行重新设计而保留其他区域。chroma通过一个类似pymol的selection语法进行结构的局部选择。重新采样设计的差异性较大。测试发现partial resampling其实是对局部结构的完全sampling,不能基于已有的局部结构,当然这可以作为固定receptor进行完全重新设计binder。

同时,测试使用了conditioners.ComposedConditioner([ss_conditioner, sub_conditioner])整合多个conditioner,
但发现会存在互相影响,
例如这里使用了生成指定二级结构类型的conditioner和指定结构区域的conditioner,
发现二级结构类型不好指定生成,生成结构和上面的类似。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import appdirs
import os
import torch
from chroma import Chroma, Protein, conditioners
from chroma.models import graph_classifier
from pathlib import Path

# Partial resampling: keep everything except chain E fixed and redesign
# chain E only, then save the result as PDB.
out_dir = Path.cwd()
# SLURM job id when present, otherwise this process id; used to tag outputs.
TEST_TASK_ID = os.environ.get('SLURM_JOBID', os.getpid())
print("Test task id: ", TEST_TASK_ID)
local_model_dir = Path(appdirs.user_cache_dir("chroma/weights"))

device = "cuda:0" if torch.cuda.is_available() else "cpu"
protein = Protein("FC-III-AAPC.pdb", device=device)
# Pass the device explicitly so the model lives where the input structure
# and the conditioner are placed, matching the other example scripts.
chroma = Chroma(
    weights_backbone=local_model_dir / "chroma_backbone_v1.0.pt",
    weights_design=local_model_dir / "chroma_design_v1.0.pt",
    device=device,
)

# Constrain the part of the structure that must be preserved
# (everything that is not chain E).
conditioner = conditioners.SubstructureConditioner(
    protein,
    chroma.backbone_network,
    selection="not chain E",
).to(device)

# Only chain E is selected for design; sampler settings are kept exactly
# as in the original experiment.
protein = chroma.sample(
    protein_init=protein,
    conditioner=conditioner,
    design_selection="chain E",
    langevin_factor=1,
    inverse_temperature=2,
    sde_func='langevin',
)
protein.to(str(out_dir / f"partial_resample_{TEST_TASK_ID}.pdb"))
partial_design_before.png
Init protein: FCIII & Fc fragment
partial_design_after.png
Partial resampling的结果

Denovo Binder Design

Fc Fragment Binder design

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import appdirs
import os
import torch
from chroma import Chroma, Protein, conditioners
from chroma.models import graph_classifier
from pathlib import Path

# De novo binder design: append a placeholder ("pseudo") chain to the
# receptor, keep the receptor (chain A) fixed, and let Chroma design the
# new chain under a CATH class constraint.
out_dir = Path.cwd()
# SLURM job id when present, otherwise this process id; used to tag outputs.
TEST_TASK_ID = os.environ.get('SLURM_JOBID', os.getpid())
print("Test task id: ", TEST_TASK_ID)
local_model_dir = Path(appdirs.user_cache_dir("chroma/weights"))

device = "cuda:0" if torch.cuda.is_available() else "cpu"
protein = Protein("igg_fragment.pdb", device=device)

# X: coordinates, C: chain map, S: sequence (as returned by to_XCS).
X, C, S = protein.to_XCS()

binder_len = 40

# Extend all three tensors along the residue axis (dim=1) by `binder_len`
# placeholder residues. dtypes are set explicitly so the concatenation never
# depends on implicit dtype inference or promotion.
X_pad = torch.zeros(1, binder_len, 4, 3, dtype=X.dtype, device=device)
# The new chain gets the next unused chain id.
binder_chain_id = int(C.max().item()) + 1
C_pad = torch.full((1, binder_len), binder_chain_id, dtype=C.dtype, device=device)
# Sequence entries are placeholders; the binder sequence is designed later.
S_pad = torch.full((1, binder_len), 0, dtype=S.dtype, device=device)

X_new = torch.cat([X, X_pad], dim=1)
C_new = torch.cat([C, C_pad], dim=1)
S_new = torch.cat([S, S_pad], dim=1)

protein = Protein(X_new, C_new, S_new, device=device)

# Pass the device explicitly so the model lives where the structure and
# the conditioners are placed, matching the other example scripts.
chroma = Chroma(
    weights_backbone=local_model_dir / "chroma_backbone_v1.0.pt",
    weights_design=local_model_dir / "chroma_design_v1.0.pt",
    device=device,
)

proclass_model = graph_classifier.load_model(
    local_model_dir / "chroma_proclass_v1.0.pt",
    device=device)
# CATH class "2" steers the binder towards sheet-type structure.
beta_conditioner = conditioners.ProClassConditioner(
    "cath",
    "2",
    max_norm=None,
    model=proclass_model).to(device)

# Hold the receptor (chain A) in place during sampling.
sub_conditioner = conditioners.SubstructureConditioner(
    protein,
    chroma.backbone_network,
    selection="chain A").to(device)

# Everything that is not chain A (i.e. the appended pseudo-chain) is
# designed; sampler settings are kept exactly as in the original experiment.
protein = chroma.sample(
    protein_init=protein,
    conditioner=conditioners.ComposedConditioner([sub_conditioner, beta_conditioner]),
    design_selection="not chain A",
    langevin_factor=2,
    langevin_isothermal=True,
    inverse_temperature=8.0,
    sde_func='langevin',
    steps=800,
)
protein.to(str(out_dir / f"binder_design_{TEST_TASK_ID}.pdb"))

通过构造pseudo-chain,利用conditioners.SubstructureConditioner可以实现针对给定receptor的binder生成,同时组合使用conditioners.ProClassConditioner可以引导生成特定类型的二级结构。如示例尝试对Fc生成一个40aa的binder,同时约束其为特定的二级结构,并使用colabfold进行验证。结果表明,chroma的骨架生成模型能生成较好的预期骨架,序列设计也能比较fit骨架,但是对docking pose的设计准确性欠佳(当然也可能是colabfold不准)。

fc binder 1
CATH=1 (helix), length=40aa
fc binder 2
CATH=2(sheet), length=40aa

但目前的流程并没有对表位或者hotspot残基进行指定。根据原文的支持材料,应该有一个substructure distances的conditioner可以实现这一点,但开源版本中并未提供;不过根据conditioner开发指南和支持材料中给出的逻辑推导,自行实现应该是可行的。

总结

Chroma是继RFdiffusion之后又一个不错的结构diffusion设计算法,但是在给定靶点和表位的设计上,还需要我们做更多技术性验证。