Source code for ginkgo_ai_client.query_parameters

"""Helpers for generating query parameters for the Ginkgo AI API."""

from typing import Dict


class TRANSFORMS:
    """Namespace of the transform descriptors used in query parameters.

    Each attribute is a dict with a single "type" key. The helper functions
    of this module place one of these dicts in the "transforms" list of the
    query parameters they return.
    """

    # Descriptor used by the mean embedding helpers.
    EMBEDDING = dict(type="EMBEDDING")
    # Descriptor used by the masked inference helpers.
    FILL_MASK = dict(type="FILL_MASK")


def aa0_mean_embedding_params(sequence: str, model: str = "ginkgo-aa0-650M") -> Dict:
    """Build the query parameters for an AA0 mean embedding query.

    The mean embedding is the mean of the token embeddings in the encoder's
    last layer.

    Parameters
    ----------
    sequence: str
        The sequence for which to compute the mean embedding.
    model: str (default: "ginkgo-aa0-650M")
        The model to use for the embedding (only "ginkgo-aa0-650M" is
        supported for now).

    Query results
    -------------
    List[float]
        The mean embedding of the sequence.

    Examples
    --------
    >>> client.query(aa0_mean_embedding_params("MLPP<mask>PPLM"))
    >>> # {"embedding": [1.05, 0.002, ...]}
    """
    params = {"model": model, "text": sequence}
    params["transforms"] = [TRANSFORMS.EMBEDDING]
    return params
def aa0_masked_inference_params(sequence: str, model: str = "ginkgo-aa0-650M") -> Dict:
    """Generate the query parameters for an AA0 masked inference query.

    The query targets Ginkgo's AA0 protein-language model.

    Parameters
    ----------
    sequence: str
        The sequence to unmask. The sequence should be of the form
        "MLPP<mask>PPLM" with as many masks as desired.
    model: str (default: "ginkgo-aa0-650M")
        The model to use for the inference (only "ginkgo-aa0-650M" is
        supported for now).

    Query results
    -------------
    {sequence: str}
        The predicted sequence where every masked position has been replaced
        by the amino acid with the highest probability at this position.

    Examples
    --------
    >>> client.query(aa0_masked_inference_params("MLPP<mask>PPLM<mask>"))
    >>> # {"sequence": "MLPPKPPLMR"}
    """
    return {
        "model": model,
        "text": sequence,
        "transforms": [TRANSFORMS.FILL_MASK],
    }
def esm_mean_embedding_params(sequence: str, model: str = "esm2-650M") -> Dict:
    """Generate the query parameters for an ESM mean embedding query.

    The mean embedding of a protein sequence refers to the mean of the token
    embedding in the encoder's last layer.

    Parameters
    ----------
    sequence: str
        The sequence for which to compute the mean embedding.
    model: str (default: "esm2-650M")
        The model to use for the embedding ("esm2-650M" or "esm2-3B").

    Query results
    -------------
    List[float]
        The mean embedding of the sequence.

    Examples
    --------
    >>> client.query(esm_mean_embedding_params("MLPP<mask>PPLM"))
    >>> # {"embedding": [1.05, 0.002, ...]}
    """
    return {
        "model": model,
        "text": sequence,
        "transforms": [TRANSFORMS.EMBEDDING],
    }
def esm_masked_inference_params(sequence: str, model: str = "esm2-650M") -> Dict:
    """Generate the query parameters for an ESM masked inference query.

    Parameters
    ----------
    sequence: str
        The sequence to unmask. The sequence should be of the form
        "MLPP<mask>PPLM" with as many masks as desired.
    model: str (default: "esm2-650M")
        The model to use for the inference ("esm2-650M" or "esm2-3B").

    Query results
    -------------
    {sequence: str}
        The predicted sequence where every masked position has been replaced
        by the amino acid with the highest probability at this position.

    Examples
    --------
    >>> client.query(esm_masked_inference_params("MLPP<mask>PPLM<mask>"))
    >>> # {"sequence": "MLPPKPPLMR"}
    """
    return {
        "model": model,
        "text": sequence,
        "transforms": [TRANSFORMS.FILL_MASK],
    }
def three_utr_mean_embedding_params(
    sequence: str, model: str = "ginkgo-maskedlm-3utr-v1"
) -> Dict:
    """Generate the query parameters for a 3UTR mean embedding query.

    The query targets Ginkgo's 3UTR language model. The mean embedding refers
    to the mean of the token embedding in the encoder's last layer.

    Parameters
    ----------
    sequence: str
        The sequence for which to compute the mean embedding, of the form
        "ATGC..."
    model: str (default: "ginkgo-maskedlm-3utr-v1")
        The model to use for the embedding (only "ginkgo-maskedlm-3utr-v1" is
        supported for now).

    Query results
    -------------
    List[float]
        The mean embedding of the sequence.

    Examples
    --------
    >>> client.query(three_utr_mean_embedding_params("ATTGCGATGC"))
    >>> # {"embedding": [1.05, 0.002, ...]}
    """
    return {
        "model": model,
        "text": sequence,
        "transforms": [TRANSFORMS.EMBEDDING],
    }
def three_utr_masked_inference_params(
    sequence: str, model: str = "ginkgo-maskedlm-3utr-v1"
) -> Dict:
    """Build the query parameters for a 3UTR masked inference query.

    The query targets Ginkgo's 3UTR language model.

    Parameters
    ----------
    sequence: str
        The sequence to unmask. The sequence should be of the form
        "ATGC<mask>ATGC" with as many masks as desired.
    model: str (default: "ginkgo-maskedlm-3utr-v1")
        The model to use for the inference (only "ginkgo-maskedlm-3utr-v1" is
        supported for now).

    Query results
    -------------
    {sequence: str}
        The predicted sequence where every masked position has been replaced
        by the "ATGC" nucleotide with the highest probability at this
        position.
    """
    transforms = [TRANSFORMS.FILL_MASK]
    return dict(model=model, text=sequence, transforms=transforms)