> ## Documentation Index
> Fetch the complete documentation index at: https://developers.telnyx.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Transcribe speech to text

> Transcribe speech to text. This endpoint is consistent with the [OpenAI Transcription API](https://platform.openai.com/docs/api-reference/audio/createTranscription) and may be used with the OpenAI JS or Python SDK.



## OpenAPI

````yaml https://telnyx-openapi-ng.s3.us-east-1.amazonaws.com/speech-to-text-filebased/speech-to-text-filebased.yml post /ai/audio/transcriptions
openapi: 3.1.0
# General metadata for the file-based speech-to-text API.
info:
  title: Speech to Text File Based API
  version: 2.0.0
  description: API for managing Speech to Text File Based.
  contact:
    email: support@telnyx.com
servers:
  - url: https://api.telnyx.com/v2
# Top-level security requirement: applies to every operation that does not
# declare its own `security`.
security:
  - bearerAuth: []
tags:
  - name: Speech to Text File Based
    description: Speech to Text File Based operations
  # Declared because the transcription operation below is tagged `Audio`;
  # without this entry the tag is used but never defined.
  - name: Audio
paths:
  # Transcription endpoint: multipart/form-data request, JSON response.
  /ai/audio/transcriptions:
    post:
      operationId: audio_public_audio_transcriptions_post
      summary: Transcribe speech to text
      tags:
        - Audio
      description: >-
        Transcribe speech to text. This endpoint is consistent with the
        [OpenAI Transcription
        API](https://platform.openai.com/docs/api-reference/audio/createTranscription)
        and may be used with the OpenAI JS or Python SDK.
      # The request body is mandatory and is sent as a multipart form.
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              $ref: "#/components/schemas/AudioTranscriptionRequest"
      responses:
        # Transcript payload.
        "200":
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/AudioTranscriptionResponse"
        # Request failed validation; the body lists the individual errors.
        "422":
          description: Validation Error
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/HTTPValidationError"
      # SDK usage examples, one entry per language (`lang`) with the sample
      # program text in `source`. Every sample passes only `model`.
      # NOTE(review): no sample supplies an audio input (`file` or `file_url`
      # from AudioTranscriptionRequest), and the JavaScript sample imports `fs`
      # without using it — confirm whether a file argument was meant to appear.
      x-codeSamples:
        # Folded scalar (`>-`): single line breaks inside the sample fold into
        # spaces and blank lines become newlines, so the blank lines below are
        # significant.
        - lang: JavaScript
          source: >-
            import fs from 'fs';

            import Telnyx from 'telnyx';


            const client = new Telnyx({
              apiKey: process.env['TELNYX_API_KEY'], // This is the default and can be omitted
            });


            const response = await client.ai.audio.transcribe({ model:
            'distil-whisper/distil-large-v2' });


            console.log(response.text);
        - lang: Python
          source: |-
            import os
            from telnyx import Telnyx

            client = Telnyx(
                api_key=os.environ.get("TELNYX_API_KEY"),  # This is the default and can be omitted
            )
            response = client.ai.audio.transcribe(
                model="distil-whisper/distil-large-v2",
            )
            print(response.text)
        # Double-quoted scalar: `\n` and `\t` are escape sequences, which keeps
        # the tab indentation of the Go source intact.
        - lang: Go
          source: "package main\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\n\t\"github.com/team-telnyx/telnyx-go\"\n\t\"github.com/team-telnyx/telnyx-go/option\"\n)\n\nfunc main() {\n\tclient := telnyx.NewClient(\n\t\toption.WithAPIKey(\"My API Key\"),\n\t)\n\tresponse, err := client.AI.Audio.Transcribe(context.TODO(), telnyx.AIAudioTranscribeParams{\n\t\tModel: telnyx.AIAudioTranscribeParamsModelDistilWhisperDistilLargeV2,\n\t})\n\tif err != nil {\n\t\tpanic(err.Error())\n\t}\n\tfmt.Printf(\"%+v\\n\", response.Text)\n}\n"
        - lang: Java
          source: |-
            package com.telnyx.sdk.example;

            import com.telnyx.sdk.client.TelnyxClient;
            import com.telnyx.sdk.client.okhttp.TelnyxOkHttpClient;
            import com.telnyx.sdk.models.ai.audio.AudioTranscribeParams;
            import com.telnyx.sdk.models.ai.audio.AudioTranscribeResponse;

            public final class Main {
                private Main() {}

                public static void main(String[] args) {
                    TelnyxClient client = TelnyxOkHttpClient.fromEnv();

                    AudioTranscribeParams params = AudioTranscribeParams.builder()
                        .model(AudioTranscribeParams.Model.DISTIL_WHISPER_DISTIL_LARGE_V2)
                        .build();
                    AudioTranscribeResponse response = client.ai().audio().transcribe(params);
                }
            }
        - lang: Ruby
          source: >-
            require "telnyx"


            telnyx = Telnyx::Client.new(api_key: "My API Key")


            response = telnyx.ai.audio.transcribe(model:
            :"distil-whisper/distil-large-v2")


            puts(response)
        - lang: CLI
          source: |-
            telnyx ai:audio transcribe \
              --api-key 'My API Key' \
              --model distil-whisper/distil-large-v2
components:
  schemas:
    # Multipart form fields accepted by the transcription endpoint.
    # NOTE(review): the descriptions say `file` and `file_url` cannot be used
    # together, but the schema itself does not encode that rule (only `model`
    # is listed under `required`) — confirm whether it should.
    AudioTranscriptionRequest:
      type: object
      properties:
        file:
          description: >-
            The audio file object to transcribe, in one of these formats: flac,
            mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. File uploads are
            limited to 100 MB. Cannot be used together with `file_url`. Note:
            `deepgram/nova-3` only supports mp3 and wav formats.
          type: string
          format: binary
        file_url:
          description: >-
            Link to audio file in one of these formats: flac, mp3, mp4, mpeg,
            mpga, m4a, ogg, wav, or webm. Support for hosted files is limited to
            100 MB. Cannot be used together with `file`. Note: `deepgram/nova-3`
            only supports mp3 and wav formats.
          type: string
          example: https://example.com/file.mp3
        model:
          description: >-
            ID of the model to use. `distil-whisper/distil-large-v2` is lower
            latency but English-only. `openai/whisper-large-v3-turbo` is
            multi-lingual but slightly higher latency. `deepgram/nova-3`
            supports English variants (en, en-US, en-GB, en-AU, en-NZ, en-IN)
            and only accepts mp3/wav files.
          example: distil-whisper/distil-large-v2
          default: distil-whisper/distil-large-v2
          type: string
          enum:
            - distil-whisper/distil-large-v2
            - openai/whisper-large-v3-turbo
            - deepgram/nova-3
        response_format:
          description: >-
            The format of the transcript output. Use `verbose_json` to take
            advantage of timestamps.
          example: json
          type: string
          default: json
          enum:
            - json
            - verbose_json
        # The `[]` suffix is part of the form field name.
        timestamp_granularities[]:
          description: >-
            The timestamp granularities to populate for this transcription.
            `response_format` must be set to `verbose_json` to use timestamp
            granularities. Currently only `segment` is supported.
          example: segment
          type: string
          enum:
            - segment
        language:
          type: string
          description: >-
            The language of the audio to be transcribed. For `deepgram/nova-3`,
            only English variants are supported: `en`, `en-US`, `en-GB`,
            `en-AU`, `en-NZ`, `en-IN`. `openai/whisper-large-v3-turbo` supports
            multiple languages. `distil-whisper/distil-large-v2` does not
            support the `language` parameter.
          example: en-US
        # Free-form object: any additional keys are accepted.
        model_config:
          type: object
          description: >-
            Additional model-specific configuration parameters. Only allowed
            with `deepgram/nova-3` model. Can include Deepgram-specific options
            such as `smart_format`, `punctuate`, `diarize`, `utterance`,
            `numerals`, and `language`. If `language` is provided both as a
            top-level parameter and in `model_config`, the top-level parameter
            takes precedence.
          additionalProperties: true
          example:
            smart_format: true
            punctuate: true
      required:
        - model
    # JSON body of a successful transcription; only `text` is always present.
    # NOTE(review): the schema description lists `text` (and optionally
    # `words`) for `deepgram/nova-3`, while the `duration` description says
    # `deepgram/nova-3` also returns `duration` — confirm which is accurate.
    AudioTranscriptionResponse:
      type: object
      required: [text]
      description: >-
        Response fields vary by model. `distil-whisper/distil-large-v2`
        returns `text`, `duration`, and `segments` in `verbose_json` mode.
        `openai/whisper-large-v3-turbo` returns `text` only.
        `deepgram/nova-3` returns `text` and, depending on `model_config`,
        may include `words` with per-word timestamps and speaker labels.
      properties:
        text:
          description: 'The transcribed text for the audio file.'
          type: string
        duration:
          description: >-
            The duration of the audio file in seconds.
            Returned by `distil-whisper/distil-large-v2` and `deepgram/nova-3`
            when `response_format` is `verbose_json`.
            Not returned by `openai/whisper-large-v3-turbo`.
          type: number
        segments:
          description: >-
            Segments of the transcribed text and their corresponding details.
            Returned by `distil-whisper/distil-large-v2` when
            `response_format` is `verbose_json`.
            Not returned by `openai/whisper-large-v3-turbo`.
          type: array
          items:
            $ref: "#/components/schemas/AudioTranscriptionResponseSegments"
        words:
          description: >-
            Word-level timestamps and optional speaker labels.
            Only returned by `deepgram/nova-3` when word-level output is
            enabled via `model_config`.
          type: array
          items:
            $ref: "#/components/schemas/AudioTranscriptionResponseWord"
    # Body of the 422 response: an array of ValidationError entries in `detail`.
    HTTPValidationError:
      type: object
      title: HTTPValidationError
      properties:
        detail:
          type: array
          title: Detail
          items:
            $ref: "#/components/schemas/ValidationError"
    # One entry of the `segments` array in AudioTranscriptionResponse.
    # All four fields are required.
    AudioTranscriptionResponseSegments:
      type: object
      required: [id, start, end, text]
      properties:
        # NOTE(review): `id` is typed `number`; confirm whether `integer` was
        # intended.
        id:
          description: 'Unique identifier of the segment.'
          type: number
        start:
          description: 'Start time of the segment in seconds.'
          type: number
        end:
          description: 'End time of the segment in seconds.'
          type: number
        text:
          description: 'Text content of the segment.'
          type: string
    # One entry of the `words` array in AudioTranscriptionResponse.
    # `word`, `start` and `end` are required; `confidence` and `speaker` are
    # optional.
    AudioTranscriptionResponseWord:
      type: object
      required: [word, start, end]
      description: >-
        Word-level timing detail. Only present when using `deepgram/nova-3`
        with `model_config` options that enable word timestamps.
      properties:
        word:
          description: 'The transcribed word.'
          type: string
        start:
          description: 'Start time of the word in seconds.'
          type: number
        end:
          description: 'End time of the word in seconds.'
          type: number
        confidence:
          description: 'Confidence score for the word (0.0 to 1.0).'
          type: number
        speaker:
          description: >-
            Speaker index.
            Only present when diarization is enabled via `model_config`.
          type: integer
    # A single validation failure, as listed in HTTPValidationError `detail`:
    # a location (`loc`), a message (`msg`) and an error type (`type`).
    ValidationError:
      type: object
      title: ValidationError
      properties:
        loc:
          type: array
          title: Location
          items:
            # Each element is either a string or an integer.
            anyOf:
              - type: string
              - type: integer
        msg:
          type: string
          title: Message
        type:
          type: string
          title: Error Type
      required: [loc, msg, type]
  securitySchemes:
    # HTTP bearer authentication; this is the scheme named by the top-level
    # `security` requirement.
    bearerAuth:
      type: http
      scheme: bearer

````