Examples

Handling large batches

 1"""In this example, we open a large fasta file with thousands of sequence
 2(could be millions) and send it little batch by little batch to the API, and save
 3the outputs (here embeddings) to disk."""
 4
 5from pathlib import Path
 6import json
 7
 8from ginkgo_ai_client import GinkgoAIClient, MeanEmbeddingQuery
 9
10input_file = Path(__file__).parent / "data" / "100_dna_sequences.fasta"
11output_folder = Path(__file__).parent / "outputs" / "large_batches"
12output_folder.mkdir(parents=True, exist_ok=True)
13
14client = GinkgoAIClient()
15model = "ginkgo-maskedlm-3utr-v1"
16queries = MeanEmbeddingQuery.iter_from_fasta(input_file, model=model)
17for batch_result in client.send_requests_by_batches(queries, batch_size=10):
18    for query_result in batch_result:
19        with open(output_folder / f"{query_result.query_name}.json", "w") as f:
20            json.dump(query_result.dict(), f)
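
The saved files can later be reloaded with the standard library alone. The sketch below
is a minimal, hypothetical follow-up step: it assumes each JSON file holds the
serialized result dict written above, with an "embedding" field matching the
prediction.embedding attribute used in the other examples.

import json
from pathlib import Path

output_folder = Path(__file__).parent / "outputs" / "large_batches"

embeddings = {}
for json_file in sorted(output_folder.glob("*.json")):
    with open(json_file) as f:
        result = json.load(f)
    # The dict mirrors the result object's fields; an "embedding" key is assumed here.
    embeddings[json_file.stem] = result["embedding"]
print(f"Loaded {len(embeddings)} embeddings")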

Examples by application

ESM model

 1"""In this example we compute embedding and run masked inference
 2 on the ESM2 language model."""
 3
 4from ginkgo_ai_client import (
 5    GinkgoAIClient,
 6    MaskedInferenceQuery,
 7    MeanEmbeddingQuery,
 8)
 9
10client = GinkgoAIClient()
11model = "esm2-650M"
12
13# SIMPLE QUERY FOR EMBEDDING COMPUTATION
14
15query = MeanEmbeddingQuery(sequence="MLYLRRL", model=model)
16prediction = client.send_request(query)
17# prediction.embedding == [1.05, -2.34, ...]
18
19
20# SIMPLE QUERY FOR MASKED INFERENCE
21
22query = MaskedInferenceQuery(sequence="MLY<mask>RRL", model=model)
23prediction = client.send_request(query)
24# prediction.sequence == "MLYRRL"
25
26# BATCH REQUEST
27
28queries = [
29    MeanEmbeddingQuery(sequence=sequence, model=model)
30    for sequence in ["MLYLRRL", "MLL", "MLYLLRRL"]
31]
32predictions = client.send_batch_request(queries)
33# predictions[0].embedding == [1.05, -2.34, ...]
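
The embeddings returned above are plain lists of floats, so they can be compared
directly with the standard library. The sketch below is a hypothetical follow-up
(cosine_similarity is a local helper defined here, not part of the client API); it
reuses only the query class and model name shown above.

import math

from ginkgo_ai_client import GinkgoAIClient, MeanEmbeddingQuery

client = GinkgoAIClient()
model = "esm2-650M"

queries = [
    MeanEmbeddingQuery(sequence=sequence, model=model)
    for sequence in ["MLYLRRL", "MLYLLRRL"]
]
predictions = client.send_batch_request(queries)


def cosine_similarity(a, b):
    """Cosine similarity between two equal-length embedding vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)


print("Similarity:", cosine_similarity(predictions[0].embedding, predictions[1].embedding))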

AA0 model

 1"""In this example we compute embedding and run masked inference
 2 on the aa0 language model."""
 3
 4from ginkgo_ai_client import (
 5    GinkgoAIClient,
 6    MaskedInferenceQuery,
 7    MeanEmbeddingQuery,
 8)
 9
10client = GinkgoAIClient()
11model = "ginkgo-aa0-650M"
12
13# SIMPLE QUERY FOR EMBEDDING COMPUTATION
14
15query = MeanEmbeddingQuery(sequence="MLYLRRL", model=model)
16prediction = client.send_request(query)
17# prediction.embedding == [1.05, -2.34, ...]
18
19
20# SIMPLE QUERY FOR MASKED INFERENCE
21
22query = MaskedInferenceQuery(sequence="MLY<mask>RRL", model=model)
23prediction = client.send_request(query)
24# prediction.sequence == "MLYRRL"
25
26# BATCH REQUEST
27
28queries = [
29    MeanEmbeddingQuery(sequence=sequence, model=model)
30    for sequence in ["MLYLRRL", "MLL", "MLYLLRRL"]
31]
32predictions = client.send_batch_request(queries)
33# predictions[0].embedding == [1.05, -2.34, ...]
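
Because both protein models accept the same query types, the same sequence can be
embedded with each of them by only swapping the model string. A minimal sketch,
assuming prediction.embedding is a plain list as in the examples above:

from ginkgo_ai_client import GinkgoAIClient, MeanEmbeddingQuery

client = GinkgoAIClient()
sequence = "MLYLRRL"

# Embed the same sequence with both protein models and compare embedding sizes.
for model in ["esm2-650M", "ginkgo-aa0-650M"]:
    prediction = client.send_request(MeanEmbeddingQuery(sequence=sequence, model=model))
    print(model, "embedding length:", len(prediction.embedding))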

mRNA discrete diffusion

 1"""In this example for generating 3' and 5' UTRs as well
 2as codon sequence for a given protein sequence of interest for
 3encoding in a linear mRNA."""
 4
 5
 6from ginkgo_ai_client.queries import RNADiffusionMaskedQuery
 7from ginkgo_ai_client import (
 8    GinkgoAIClient,
 9)
10
11client = GinkgoAIClient()
12model = "mrna-foundation"
13
14# SIMPLE QUERY FOR GENERATING PARTIAL/FULLY MASKED UTRs
15
16client = GinkgoAIClient()
17three_utr="<mask>" * 20
18five_utr="AAA<mask>TTTGGGCC<mask><mask>"
19protein_sequence="MAKS-" # '-' denotes end of protein sequence
20species="HOMO_SAPIENS"
21
22query = RNADiffusionMaskedQuery(
23    three_utr=three_utr,
24    five_utr=five_utr,
25    protein_sequence=protein_sequence,
26    species=species,
27    model=model,
28    temperature=1.0,
29    decoding_order_strategy="entropy",
30    unmaskings_per_step=10,
31    num_samples=1
32)
33response = client.send_request(query)
34samples = response.samples
35
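
To draw several candidate designs instead of one, the same query can be sent with a
larger num_samples. The sketch below reuses the parameters shown above; the structure
of each returned sample is model-specific, so the samples are only printed here, not
interpreted.

from ginkgo_ai_client import GinkgoAIClient
from ginkgo_ai_client.queries import RNADiffusionMaskedQuery

client = GinkgoAIClient()

# Same masked-design query as above, but asking for five candidate designs.
query = RNADiffusionMaskedQuery(
    three_utr="<mask>" * 20,
    five_utr="AAA<mask>TTTGGGCC<mask><mask>",
    protein_sequence="MAKS-",  # '-' denotes end of protein sequence
    species="HOMO_SAPIENS",
    model="mrna-foundation",
    temperature=1.0,
    decoding_order_strategy="entropy",
    unmaskings_per_step=10,
    num_samples=5,
)
response = client.send_request(query)
for i, sample in enumerate(response.samples):
    print(f"Sample {i}:", sample)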

3'UTR model

 1"""In this example we compute embedding and run masked inference
 2 on Ginkgo's 3'UTR language model."""
 3
 4from ginkgo_ai_client import (
 5    GinkgoAIClient,
 6    MaskedInferenceQuery,
 7    MeanEmbeddingQuery,
 8)
 9
10client = GinkgoAIClient()
11model = "ginkgo-maskedlm-3utr-v1"
12
13# SIMPLE QUERY FOR EMBEDDING COMPUTATION
14
15query = MeanEmbeddingQuery(sequence="ATTGCG", model=model)
16prediction = client.send_request(query)
17# prediction.embedding == [1.05, -2.34, ...]
18
19
20# SIMPLE QUERY FOR MASKED INFERENCE
21
22query = MaskedInferenceQuery(sequence="ATT<mask>TAC", model=model)
23prediction = client.send_request(query)
24
25# BATCH REQUEST
26
27queries = [
28    MeanEmbeddingQuery(sequence=sequence, model=model)
29    for sequence in ["AGCGC", "ATTGCG", "TACCGCA"]
30]
31predictions = client.send_batch_request(queries)
32# predictions[0].embedding == [1.05, -2.34, ...]

Promoter activity with Promoter-0

 1"""
 2This example shows how to use the PromoterActivityQuery to predict the activity of a
 3promoter in different tissues, based on the Borzoi model.
 4"""
 5
 6from pathlib import Path
 7from ginkgo_ai_client import GinkgoAIClient, PromoterActivityQuery
 8
 9client = GinkgoAIClient()
10orf_sequence = "tgccagccatctgttgtttgcc"
11promoter_sequence = "GTCCCACTGATGAACTGTGCT"
12
13
14query = PromoterActivityQuery(
15    promoter_sequence=promoter_sequence,
16    orf_sequence=orf_sequence,
17    source="expression",
18    tissue_of_interest={
19        "heart": ["CNhs10608+", "CNhs10612+"],
20        "liver": ["CNhs10608+", "CNhs10612+"],
21    },
22)
23
24response = client.send_request(query)
25print("Single-query response:", response)
26
27
28# In this next example we pull the promoter files from a fasta file and send them
29# in batches, writing the results to a JSONL, as they arrive.
30
31fasta_path = Path(__file__).parent / "data" / "100_dna_sequences.fasta"
32queries = PromoterActivityQuery.iter_with_promoter_from_fasta(
33    fasta_path=fasta_path,
34    orf_sequence=orf_sequence,
35    source="expression",
36    tissue_of_interest={
37        "heart": ["CNhs10608+", "CNhs10612+"],
38        "liver": ["CNhs10608+", "CNhs10612+"],
39    },
40)
41
42print("Now sending 100 requests, by batches of 10")
43print("Writing results to promoter_activity.jsonl...")
44output_file = Path(__file__).parent / "outputs" / "promoter_activity.jsonl"
45for batch_result in client.send_requests_by_batches(queries, batch_size=10):
46    for query_result in batch_result:
47        query_result.write_to_jsonl(output_file)
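
The JSONL file written above can be read back with the standard library. This is a
minimal sketch that assumes write_to_jsonl appends one JSON-serialized result per
line; the exact fields of each result depend on the PromoterActivityQuery output.

import json
from pathlib import Path

output_file = Path(__file__).parent / "outputs" / "promoter_activity.jsonl"

results = []
with open(output_file) as f:
    for line in f:
        if line.strip():
            results.append(json.loads(line))
print(f"Loaded {len(results)} promoter activity results")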