Embedding

InspectorEmbeddings

A class to generate and manage embeddings for textual content using Azure OpenAI or OpenAI.

Attributes:

Name                 Type                  Description
client               AzureOpenAI | OpenAI  The embedding client configured for the selected service.
embedding_float      list                  The list of generated embeddings in float format.
embedding_bytes      list                  The list of generated embeddings in byte format.
dimensions           int                   The dimensionality of the generated embeddings.
data_to_vectorstore  list                  A list of dictionaries prepared for storing in a vector database.

Source code in model/embedding.py
# Relies on module-level imports not shown here: os, numpy as np, AzureOpenAI, OpenAI, and SplitText.
class InspectorEmbeddings:
    """
    A class to generate and manage embeddings for textual content using Azure OpenAI or OpenAI.

    Attributes:
        client (AzureOpenAI | OpenAI): The embedding client configured for the selected service.
        embedding_float (list): The list of generated embeddings in float format.
        embedding_bytes (list): The list of generated embeddings in byte format.
        dimensions (int): The dimensionality of the generated embeddings.
        data_to_vectorstore (list): A list of dictionaries prepared for storing in a vector database.
    """
    def __init__(self):
        """
        Initializes the InspectorEmbeddings instance with empty attributes.
        The embedding client is configured when create_embedding() is called.
        """
        self.client = None
        self.embedding_float = None
        self.embedding_bytes = None
        self.dimensions = None
        self.data_to_vectorstore = []

    def create_embedding(
            self,
            content: str,
            dimensions: int = 3072,
            file_name: str = "file_name",
            chunk_size: int = 8000,
            service: str = "azure",
        ) -> list:
        """
        Creates embeddings for the given content using Azure OpenAI or OpenAI.

        Args:
            content (str): The textual content to generate embeddings for.
            dimensions (int): The number of dimensions for the embeddings. Defaults to 3072.
            file_name (str): The name of the file associated with the content. Defaults to "file_name".
            chunk_size (int): The maximum size of text chunks. Defaults to 8000.
            service (str): The embedding service to use, either "azure" or "openai". Defaults to "azure".

        Returns:
            list: A list of embeddings in float format.

        Raises:
            RuntimeError: If the client cannot be created or the embedding request fails.
        """
        # Split the content into chunks no larger than chunk_size
        self.text_splitted_list = SplitText(chunk_size).split_text(content)

        # Remove blank items
        text_list = [item for item in self.text_splitted_list if item]

        # Configure the client for the selected service and request the embeddings
        try:
            if service == "azure":
                self.client = AzureOpenAI(
                    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
                    api_version=os.getenv("OPENAI_API_VERSION"),
                    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
                    azure_deployment=os.getenv("AZURE_EMBEDDING_DEPLOYMENT"),
                )
            elif service == "openai":
                self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
            else:
                raise ValueError(f"Unknown service '{service}'; expected 'azure' or 'openai'.")

            self.embedding = self.client.embeddings.create(
                input=text_list,
                model='text-embedding-3-large',
                dimensions=dimensions,
            )
        except Exception as e:
            raise RuntimeError(f"Failed to create embeddings: {e}") from e

        # Keep both float and byte representations of each embedding vector
        self.embedding_float = [emb.embedding for emb in self.embedding.data]
        self.embedding_bytes = [np.array(emb, dtype=np.float32).tobytes() for emb in self.embedding_float]
        self.dimensions = len(self.embedding.data[0].embedding)
        self.text_list = text_list
        self.file_name = file_name

        return self.embedding_float


    def prepare_data(self) -> list:
        """
        Prepares the embedding data for storage in a vector database.

        Returns:
            list: A list of dictionaries containing file information, section number, text, and embedding.
        """
        # One record per chunk: file metadata plus the byte-encoded vector
        for i, text in enumerate(self.text_list):
            self.data_to_vectorstore.append(
                {
                    "file_name": self.file_name,
                    "section": i + 1,
                    "text": text,
                    "embedding": self.embedding_bytes[i],
                }
            )
        return self.data_to_vectorstore
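
A minimal usage sketch (not part of the source): it assumes the import path follows the model/embedding.py location shown above, that the Azure environment variables read by create_embedding() (AZURE_OPENAI_API_KEY, OPENAI_API_VERSION, AZURE_OPENAI_ENDPOINT, AZURE_EMBEDDING_DEPLOYMENT) are already set, and that report.md is a hypothetical input file.

from model.embedding import InspectorEmbeddings

with open("report.md", encoding="utf-8") as f:   # hypothetical input file
    content = f.read()

inspector = InspectorEmbeddings()
vectors = inspector.create_embedding(
    content,
    dimensions=3072,
    file_name="report.md",
    chunk_size=8000,
    service="azure",
)
records = inspector.prepare_data()

print(len(vectors), inspector.dimensions)  # one vector per non-empty chunk, 3072 floats each
print(records[0].keys())                   # dict_keys(['file_name', 'section', 'text', 'embedding'])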

__init__()

Initializes the InspectorEmbeddings instance with empty attributes; the embedding client is configured when create_embedding() is called.

Source code in model/embedding.py
def __init__(self):
    """
    Initializes the InspectorEmbeddings instance with empty attributes.
    The embedding client is configured when create_embedding() is called.
    """
    self.client = None
    self.embedding_float = None
    self.embedding_bytes = None
    self.dimensions = None
    self.data_to_vectorstore = []

create_embedding(content, dimensions=3072, file_name='file_name', chunk_size=8000, service='azure')

Creates embeddings for the given content using Azure OpenAI or OpenAI.

Parameters:

Name        Type  Description                                                 Default
content     str   The textual content to generate embeddings for.            required
dimensions  int   The number of dimensions for the embeddings.               3072
file_name   str   The name of the file associated with the content.          'file_name'
chunk_size  int   The maximum size of text chunks.                           8000
service     str   The embedding service to use, either "azure" or "openai".  'azure'

Returns:

Type  Description
list  A list of embeddings in float format.
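
For the hosted OpenAI path, a hedged sketch (it assumes OPENAI_API_KEY is set and relies on text-embedding-3-large accepting a reduced dimensions value; 1024 and notes.txt are purely illustrative):

from model.embedding import InspectorEmbeddings   # path assumed from model/embedding.py

emb = InspectorEmbeddings()
vectors = emb.create_embedding(
    "Some document text to embed.",
    dimensions=1024,          # text-embedding-3-large accepts shortened embeddings
    file_name="notes.txt",    # hypothetical file name
    service="openai",         # requires OPENAI_API_KEY in the environment
)
assert len(vectors[0]) == 1024 == emb.dimensions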

Source code in model/embedding.py
def create_embedding(
        self,
        content: str,
        dimensions: int = 3072,
        file_name: str = "file_name",
        chunk_size: int = 8000,
        service: str = "azure",
    ) -> list:
    """
    Creates embeddings for the given content using Azure OpenAI or OpenAI.

    Args:
        content (str): The textual content to generate embeddings for.
        dimensions (int): The number of dimensions for the embeddings. Defaults to 3072.
        file_name (str): The name of the file associated with the content. Defaults to "file_name".
        chunk_size (int): The maximum size of text chunks. Defaults to 8000.
        service (str): The embedding service to use, either "azure" or "openai". Defaults to "azure".

    Returns:
        list: A list of embeddings in float format.

    Raises:
        RuntimeError: If the client cannot be created or the embedding request fails.
    """
    # Split the content into chunks no larger than chunk_size
    self.text_splitted_list = SplitText(chunk_size).split_text(content)

    # Remove blank items
    text_list = [item for item in self.text_splitted_list if item]

    # Configure the client for the selected service and request the embeddings
    try:
        if service == "azure":
            self.client = AzureOpenAI(
                api_key=os.getenv("AZURE_OPENAI_API_KEY"),
                api_version=os.getenv("OPENAI_API_VERSION"),
                azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
                azure_deployment=os.getenv("AZURE_EMBEDDING_DEPLOYMENT"),
            )
        elif service == "openai":
            self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        else:
            raise ValueError(f"Unknown service '{service}'; expected 'azure' or 'openai'.")

        self.embedding = self.client.embeddings.create(
            input=text_list,
            model='text-embedding-3-large',
            dimensions=dimensions,
        )
    except Exception as e:
        raise RuntimeError(f"Failed to create embeddings: {e}") from e

    # Keep both float and byte representations of each embedding vector
    self.embedding_float = [emb.embedding for emb in self.embedding.data]
    self.embedding_bytes = [np.array(emb, dtype=np.float32).tobytes() for emb in self.embedding_float]
    self.dimensions = len(self.embedding.data[0].embedding)
    self.text_list = text_list
    self.file_name = file_name

    return self.embedding_float
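
SplitText is imported from elsewhere in the project and its implementation is not shown on this page. The following is only a plausible minimal sketch of a character-based splitter with the interface create_embedding() relies on (a constructor taking chunk_size and a split_text(content) method returning a list of strings); the real helper may split on tokens or paragraph boundaries instead.

class SplitText:
    """Hypothetical stand-in for the project's text-splitting helper."""

    def __init__(self, chunk_size: int = 8000):
        self.chunk_size = chunk_size

    def split_text(self, content: str) -> list[str]:
        # Naive fixed-width split by character count; the actual implementation
        # may respect token limits or sentence boundaries.
        return [
            content[i:i + self.chunk_size]
            for i in range(0, len(content), self.chunk_size)
        ]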

prepare_data()

Prepares the embedding data for storage in a vector database.

Returns:

Type  Description
list  A list of dictionaries containing file information, section number, text, and embedding.

Source code in model/embedding.py
def prepare_data(self) -> list:
    """
    Prepares the embedding data for storage in a vector database.

    Returns:
        list: A list of dictionaries containing file information, section number, text, and embedding.
    """
    # One record per chunk: file metadata plus the byte-encoded vector
    for i, text in enumerate(self.text_list):
        self.data_to_vectorstore.append(
            {
                "file_name": self.file_name,
                "section": i + 1,
                "text": text,
                "embedding": self.embedding_bytes[i],
            }
        )
    return self.data_to_vectorstore
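
Because each record stores its vector as raw float32 bytes, a consumer reading records back out of the vector store can restore the array with NumPy. A small sketch (the import path and the Azure environment variables are assumed, as in the usage example above):

import numpy as np

from model.embedding import InspectorEmbeddings

inspector = InspectorEmbeddings()
inspector.create_embedding("Some text to embed.", file_name="notes.txt", service="azure")
records = inspector.prepare_data()

# np.frombuffer reverses the tobytes() encoding done in create_embedding()
raw = records[0]["embedding"]
vector = np.frombuffer(raw, dtype=np.float32)
assert vector.shape[0] == inspector.dimensions   # 3072 by default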