Examples
Handling large batches
"""Send a large FASTA file to the API in small batches.

In this example, we open a large FASTA file with thousands of sequences
(could be millions), send it to the API batch by batch, and save the
outputs (here, embeddings) to disk as one JSON file per query.
"""

import json
from pathlib import Path

from ginkgo_ai_client import GinkgoAIClient, MeanEmbeddingQuery

input_file = Path(__file__).parent / "data" / "100_dna_sequences.fasta"
output_folder = Path(__file__).parent / "outputs" / "large_batches"
# Create the destination up front so the per-query writes below cannot fail
# on a missing directory.
output_folder.mkdir(parents=True, exist_ok=True)

client = GinkgoAIClient()
model = "ginkgo-maskedlm-3utr-v1"

# The queries built from the FASTA records are sent 10 at a time; each batch
# result is written out before the next batch is processed.
queries = MeanEmbeddingQuery.iter_from_fasta(input_file, model=model)
for batch_result in client.send_requests_by_batches(queries, batch_size=10):
    for query_result in batch_result:
        output_path = output_folder / f"{query_result.query_name}.json"
        with open(output_path, "w", encoding="utf-8") as f:
            # NOTE(review): if the result is a pydantic v2 model, `.dict()` is
            # deprecated in favor of `.model_dump()` -- confirm before changing.
            json.dump(query_result.dict(), f)
Example by application
ESM model
"""In this example we compute embeddings and run masked inference
on the ESM2 language model."""

from ginkgo_ai_client import (
    GinkgoAIClient,
    MaskedInferenceQuery,
    MeanEmbeddingQuery,
)

client = GinkgoAIClient()
model = "esm2-650M"

# SIMPLE QUERY FOR EMBEDDING COMPUTATION

query = MeanEmbeddingQuery(sequence="MLYLRRL", model=model)
prediction = client.send_request(query)
# prediction.embedding == [1.05, -2.34, ...]


# SIMPLE QUERY FOR MASKED INFERENCE

# The "<mask>" token marks the position the model is asked to fill in.
query = MaskedInferenceQuery(sequence="MLY<mask>RRL", model=model)
prediction = client.send_request(query)
# e.g. prediction.sequence == "MLYLRRL"
# (illustrative: the residue predicted at the masked position depends on the model)

# BATCH REQUEST

# Several queries can be sent in a single call; predictions are indexed like
# the input list.
queries = [
    MeanEmbeddingQuery(sequence=sequence, model=model)
    for sequence in ["MLYLRRL", "MLL", "MLYLLRRL"]
]
predictions = client.send_batch_request(queries)
# predictions[0].embedding == [1.05, -2.34, ...]
AA0 model
"""In this example we compute embeddings and run masked inference
on the aa0 language model."""

from ginkgo_ai_client import (
    GinkgoAIClient,
    MaskedInferenceQuery,
    MeanEmbeddingQuery,
)

client = GinkgoAIClient()
model = "ginkgo-aa0-650M"

# SIMPLE QUERY FOR EMBEDDING COMPUTATION

query = MeanEmbeddingQuery(sequence="MLYLRRL", model=model)
prediction = client.send_request(query)
# prediction.embedding == [1.05, -2.34, ...]


# SIMPLE QUERY FOR MASKED INFERENCE

# The "<mask>" token marks the position the model is asked to fill in.
query = MaskedInferenceQuery(sequence="MLY<mask>RRL", model=model)
prediction = client.send_request(query)
# e.g. prediction.sequence == "MLYLRRL"
# (illustrative: the residue predicted at the masked position depends on the model)

# BATCH REQUEST

# Several queries can be sent in a single call; predictions are indexed like
# the input list.
queries = [
    MeanEmbeddingQuery(sequence=sequence, model=model)
    for sequence in ["MLYLRRL", "MLL", "MLYLLRRL"]
]
predictions = client.send_batch_request(queries)
# predictions[0].embedding == [1.05, -2.34, ...]
3’UTR model
"""In this example we compute embeddings and run masked inference
on Ginkgo's 3'UTR language model."""

from ginkgo_ai_client import (
    GinkgoAIClient,
    MaskedInferenceQuery,
    MeanEmbeddingQuery,
)

client = GinkgoAIClient()
model = "ginkgo-maskedlm-3utr-v1"

# SIMPLE QUERY FOR EMBEDDING COMPUTATION

query = MeanEmbeddingQuery(sequence="ATTGCG", model=model)
prediction = client.send_request(query)
# prediction.embedding == [1.05, -2.34, ...]


# SIMPLE QUERY FOR MASKED INFERENCE

# The "<mask>" token marks the position the model is asked to fill in.
query = MaskedInferenceQuery(sequence="ATT<mask>TAC", model=model)
prediction = client.send_request(query)

# BATCH REQUEST

# Several queries can be sent in a single call; predictions are indexed like
# the input list.
queries = [
    MeanEmbeddingQuery(sequence=sequence, model=model)
    for sequence in ["AGCGC", "ATTGCG", "TACCGCA"]
]
predictions = client.send_batch_request(queries)
# predictions[0].embedding == [1.05, -2.34, ...]
Promoter activity with Promoter-0
"""
This example shows how to use the PromoterActivityQuery to predict the activity of a
promoter in different tissues, based on the Borzoi model.
"""

from pathlib import Path

from ginkgo_ai_client import GinkgoAIClient, PromoterActivityQuery

client = GinkgoAIClient()
orf_sequence = "tgccagccatctgttgtttgcc"
promoter_sequence = "GTCCCACTGATGAACTGTGCT"

# Tissue name -> list of track identifiers; the same mapping is used for the
# single query and for the batched queries below.
# NOTE(review): both tissues list the same two tracks -- confirm this is intended.
tissue_of_interest = {
    "heart": ["CNhs10608+", "CNhs10612+"],
    "liver": ["CNhs10608+", "CNhs10612+"],
}

query = PromoterActivityQuery(
    promoter_sequence=promoter_sequence,
    orf_sequence=orf_sequence,
    source="expression",
    tissue_of_interest=tissue_of_interest,
)

response = client.send_request(query)
print("Single-query response:", response)


# In this next example we pull the promoter sequences from a fasta file and send
# them in batches, writing the results to a JSONL file as they arrive.

fasta_path = Path(__file__).parent / "data" / "100_dna_sequences.fasta"
queries = PromoterActivityQuery.iter_with_promoter_from_fasta(
    fasta_path=fasta_path,
    orf_sequence=orf_sequence,
    source="expression",
    tissue_of_interest=tissue_of_interest,
)

print("Now sending 100 requests, by batches of 10")
print("Writing results to promoter_activity.jsonl...")
output_file = Path(__file__).parent / "outputs" / "promoter_activity.jsonl"
# Make sure the "outputs" folder exists before the first result is written;
# this is a no-op when the folder is already there.
output_file.parent.mkdir(parents=True, exist_ok=True)
for batch_result in client.send_requests_by_batches(queries, batch_size=10):
    for query_result in batch_result:
        query_result.write_to_jsonl(output_file)
Boltz structure inference
Structure inference with a simple (single-chain) protein sequence:
"""Simple example where we predict the 3D structure of the GFP protein."""

from ginkgo_ai_client import GinkgoAIClient, BoltzStructurePredictionQuery

client = GinkgoAIClient()
# Adjacent string literals are concatenated into one single-chain sequence.
sequence = (
    "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTL"
    "VTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLV"
    "NRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLAD"
    "HYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
)

query = BoltzStructurePredictionQuery.from_protein_sequence(sequence)
response = client.send_request(query)
# The predicted structure is saved to "GFP.pdb" (path relative to the current
# working directory).
response.download_structure("GFP.pdb")
Structure inference with a multimer protein sequence and ligand(s):
"""We predict the structure of a multimer protein with ligand(s)."""

from ginkgo_ai_client import GinkgoAIClient, BoltzStructurePredictionQuery

client = GinkgoAIClient()
# NOTE: "with_ligand.yaml" is a relative path, so the script presumably has to
# be run from the folder that contains the file -- confirm against
# from_yaml_file's path handling.
query = BoltzStructurePredictionQuery.from_yaml_file("with_ligand.yaml")
print("Sending the request, it might take a hot minute...")
# An explicit, generous timeout is passed because this request can be slow.
response = client.send_request(query, timeout=1000)
response.download_structure("with_ligand.cif")