Examples

Handling large batches

 1"""In this example, we open a large fasta file with thousands of sequence
 2(could be millions) and send it little batch by little batch to the API, and save
 3the outputs (here embeddings) to disk."""
 4
 5from pathlib import Path
 6import json
 7
 8from ginkgo_ai_client import GinkgoAIClient, MeanEmbeddingQuery
 9
10input_file = Path(__file__).parent / "data" / "100_dna_sequences.fasta"
11output_folder = Path(__file__).parent / "outputs" / "large_batches"
12output_folder.mkdir(parents=True, exist_ok=True)
13
14client = GinkgoAIClient()
15model = "ginkgo-maskedlm-3utr-v1"
16queries = MeanEmbeddingQuery.iter_from_fasta(input_file, model=model)
17for batch_result in client.send_requests_by_batches(queries, batch_size=10):
18    for query_result in batch_result:
19        with open(output_folder / f"{query_result.query_name}.json", "w") as f:
20            json.dump(query_result.dict(), f)
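
The saved files can later be reloaded with the standard library alone. The sketch below
is a minimal, hypothetical follow-up step: it assumes each JSON file holds the
serialized result dict written above, with an "embedding" field matching the
prediction.embedding attribute used in the other examples.

import json
from pathlib import Path

output_folder = Path(__file__).parent / "outputs" / "large_batches"

embeddings = {}
for json_file in sorted(output_folder.glob("*.json")):
    with open(json_file) as f:
        result = json.load(f)
    # The dict mirrors the result object's fields; an "embedding" key is assumed here.
    embeddings[json_file.stem] = result["embedding"]
print(f"Loaded {len(embeddings)} embeddings")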

Examples by application

ESM model

 1"""In this example we compute embedding and run masked inference
 2 on the ESM2 language model."""
 3
 4from ginkgo_ai_client import (
 5    GinkgoAIClient,
 6    MaskedInferenceQuery,
 7    MeanEmbeddingQuery,
 8)
 9
10client = GinkgoAIClient()
11model = "esm2-650M"
12
13# SIMPLE QUERY FOR EMBEDDING COMPUTATION
14
15query = MeanEmbeddingQuery(sequence="MLYLRRL", model=model)
16prediction = client.send_request(query)
17# prediction.embedding == [1.05, -2.34, ...]
18
19
20# SIMPLE QUERY FOR MASKED INFERENCE
21
22query = MaskedInferenceQuery(sequence="MLY<mask>RRL", model=model)
23prediction = client.send_request(query)
24# prediction.sequence == "MLYRRL"
25
26# BATCH REQUEST
27
28queries = [
29    MeanEmbeddingQuery(sequence=sequence, model=model)
30    for sequence in ["MLYLRRL", "MLL", "MLYLLRRL"]
31]
32predictions = client.send_batch_request(queries)
33# predictions[0].embedding == [1.05, -2.34, ...]
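
The embeddings returned above are plain lists of floats, so they can be compared
directly with the standard library. The sketch below is a hypothetical follow-up
(cosine_similarity is a local helper defined here, not part of the client API); it
reuses only the query class and model name shown above.

import math

from ginkgo_ai_client import GinkgoAIClient, MeanEmbeddingQuery

client = GinkgoAIClient()
model = "esm2-650M"

queries = [
    MeanEmbeddingQuery(sequence=sequence, model=model)
    for sequence in ["MLYLRRL", "MLYLLRRL"]
]
predictions = client.send_batch_request(queries)


def cosine_similarity(a, b):
    """Cosine similarity between two equal-length embedding vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)


print("Similarity:", cosine_similarity(predictions[0].embedding, predictions[1].embedding))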

AA0 model

 1"""In this example we compute embedding and run masked inference
 2 on the aa0 language model."""
 3
 4from ginkgo_ai_client import (
 5    GinkgoAIClient,
 6    MaskedInferenceQuery,
 7    MeanEmbeddingQuery,
 8)
 9
10client = GinkgoAIClient()
11model = "ginkgo-aa0-650M"
12
13# SIMPLE QUERY FOR EMBEDDING COMPUTATION
14
15query = MeanEmbeddingQuery(sequence="MLYLRRL", model=model)
16prediction = client.send_request(query)
17# prediction.embedding == [1.05, -2.34, ...]
18
19
20# SIMPLE QUERY FOR MASKED INFERENCE
21
22query = MaskedInferenceQuery(sequence="MLY<mask>RRL", model=model)
23prediction = client.send_request(query)
24# prediction.sequence == "MLYRRL"
25
26# BATCH REQUEST
27
28queries = [
29    MeanEmbeddingQuery(sequence=sequence, model=model)
30    for sequence in ["MLYLRRL", "MLL", "MLYLLRRL"]
31]
32predictions = client.send_batch_request(queries)
33# predictions[0].embedding == [1.05, -2.34, ...]
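
Because both protein models accept the same query types, the same sequence can be
embedded with each of them by only swapping the model string. A minimal sketch,
assuming prediction.embedding is a plain list as in the examples above:

from ginkgo_ai_client import GinkgoAIClient, MeanEmbeddingQuery

client = GinkgoAIClient()
sequence = "MLYLRRL"

# Embed the same sequence with both protein models and compare embedding sizes.
for model in ["esm2-650M", "ginkgo-aa0-650M"]:
    prediction = client.send_request(MeanEmbeddingQuery(sequence=sequence, model=model))
    print(model, "embedding length:", len(prediction.embedding))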

mRNA discrete diffusion

 1"""In this example for generating 3' and 5' UTRs as well
 2as codon sequence for a given protein sequence of interest for
 3encoding in a linear mRNA."""
 4
 5
 6from ginkgo_ai_client.queries import RNADiffusionMaskedQuery
 7from ginkgo_ai_client import (
 8    GinkgoAIClient,
 9)
10
11client = GinkgoAIClient()
12model = "mrna-foundation"
13
14# SIMPLE QUERY FOR GENERATING PARTIAL/FULLY MASKED UTRs
15
16client = GinkgoAIClient()
17three_utr="<mask>" * 20
18five_utr="AAA<mask>TTTGGGCC<mask><mask>"
19protein_sequence="MAKS-" # '-' denotes end of protein sequence
20species="HOMO_SAPIENS"
21
22query = RNADiffusionMaskedQuery(
23    three_utr=three_utr,
24    five_utr=five_utr,
25    protein_sequence=protein_sequence,
26    species=species,
27    model=model,
28    temperature=1.0,
29    decoding_order_strategy="entropy",
30    unmaskings_per_step=10,
31    num_samples=1
32)
33response = client.send_request(query)
34samples = response.samples
35
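
To draw several candidate designs instead of one, the same query can be sent with a
larger num_samples. The sketch below reuses the parameters shown above; the structure
of each returned sample is model-specific, so the samples are only printed here, not
interpreted.

from ginkgo_ai_client import GinkgoAIClient
from ginkgo_ai_client.queries import RNADiffusionMaskedQuery

client = GinkgoAIClient()

# Same masked-design query as above, but asking for five candidate designs.
query = RNADiffusionMaskedQuery(
    three_utr="<mask>" * 20,
    five_utr="AAA<mask>TTTGGGCC<mask><mask>",
    protein_sequence="MAKS-",  # '-' denotes end of protein sequence
    species="HOMO_SAPIENS",
    model="mrna-foundation",
    temperature=1.0,
    decoding_order_strategy="entropy",
    unmaskings_per_step=10,
    num_samples=5,
)
response = client.send_request(query)
for i, sample in enumerate(response.samples):
    print(f"Sample {i}:", sample)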

3'UTR model

 1"""In this example we compute embedding and run masked inference
 2 on Ginkgo's 3'UTR language model."""
 3
 4from ginkgo_ai_client import (
 5    GinkgoAIClient,
 6    MaskedInferenceQuery,
 7    MeanEmbeddingQuery,
 8)
 9
10client = GinkgoAIClient()
11model = "ginkgo-maskedlm-3utr-v1"
12
13# SIMPLE QUERY FOR EMBEDDING COMPUTATION
14
15query = MeanEmbeddingQuery(sequence="ATTGCG", model=model)
16prediction = client.send_request(query)
17# prediction.embedding == [1.05, -2.34, ...]
18
19
20# SIMPLE QUERY FOR MASKED INFERENCE
21
22query = MaskedInferenceQuery(sequence="ATT<mask>TAC", model=model)
23prediction = client.send_request(query)
24
25# BATCH REQUEST
26
27queries = [
28    MeanEmbeddingQuery(sequence=sequence, model=model)
29    for sequence in ["AGCGC", "ATTGCG", "TACCGCA"]
30]
31predictions = client.send_batch_request(queries)
32# predictions[0].embedding == [1.05, -2.34, ...]

Promoter activity with Promoter-0

 1"""
 2This example shows how to use the PromoterActivityQuery to predict the activity of a
 3promoter in different tissues, based on the Borzoi model.
 4"""
 5
 6from pathlib import Path
 7from ginkgo_ai_client import GinkgoAIClient, PromoterActivityQuery
 8
 9client = GinkgoAIClient()
10orf_sequence = "tgccagccatctgttgtttgcc"
11promoter_sequence = "GTCCCACTGATGAACTGTGCT"
12
13
14query = PromoterActivityQuery(
15    promoter_sequence=promoter_sequence,
16    orf_sequence=orf_sequence,
17    source="expression",
18    tissue_of_interest={
19        "heart": ["CNhs10608+", "CNhs10612+"],
20        "liver": ["CNhs10608+", "CNhs10612+"],
21    },
22)
23
24response = client.send_request(query)
25print("Single-query response:", response)
26
27
28# In this next example we pull the promoter files from a fasta file and send them
29# in batches, writing the results to a JSONL, as they arrive.
30
31fasta_path = Path(__file__).parent / "data" / "100_dna_sequences.fasta"
32queries = PromoterActivityQuery.iter_with_promoter_from_fasta(
33    fasta_path=fasta_path,
34    orf_sequence=orf_sequence,
35    source="expression",
36    tissue_of_interest={
37        "heart": ["CNhs10608+", "CNhs10612+"],
38        "liver": ["CNhs10608+", "CNhs10612+"],
39    },
40)
41
42print("Now sending 100 requests, by batches of 10")
43print("Writing results to promoter_activity.jsonl...")
44output_file = Path(__file__).parent / "outputs" / "promoter_activity.jsonl"
45for batch_result in client.send_requests_by_batches(queries, batch_size=10):
46    for query_result in batch_result:
47        query_result.write_to_jsonl(output_file)
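
The JSONL file written above can be read back with the standard library. This is a
minimal sketch that assumes write_to_jsonl appends one JSON-serialized result per
line; the exact fields of each result depend on the PromoterActivityQuery output.

import json
from pathlib import Path

output_file = Path(__file__).parent / "outputs" / "promoter_activity.jsonl"

results = []
with open(output_file) as f:
    for line in f:
        if line.strip():
            results.append(json.loads(line))
print(f"Loaded {len(results)} promoter activity results")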