File: //snap/google-cloud-cli/396/lib/surface/ml/speech/__init__.yaml
examples:
recognize: |
To get a transcript of an audio file 'my-recording.wav':
$ {command} 'my-recording.wav' --language-code=en-US
To get a transcript of the audio file at the Google Cloud Storage URL
'gs://bucket/myaudio' with a custom sampling rate and encoding, using hints
and filtering profanity:
$ {command} 'gs://bucket/myaudio' --language-code=es-ES --sample-rate=2200 --hints=Bueno --encoding=OGG_OPUS --filter-profanity
args_v1:
- api_field: audio
arg_name: audio
help_text: |
The location of the audio file to transcribe. Must be a local path or a
Google Cloud Storage URL (in the format gs://bucket/object).
is_positional: true
processor: googlecloudsdk.command_lib.ml.speech.util:GetAudioHook:version=v1
args_v1p1beta1:
- api_field: audio
arg_name: audio
help_text: |
The location of the audio file to transcribe. Must be a local path or a
Google Cloud Storage URL (in the format gs://bucket/object).
is_positional: true
processor: googlecloudsdk.command_lib.ml.speech.util:GetAudioHook:version=v1p1beta1
- api_field: config.enableWordConfidence
arg_name: include-word-confidence
help_text: |
Include a list of words and the confidence for those words in the top
result.
- group:
params:
- api_field: config.diarizationSpeakerCount
arg_name: diarization-speaker-count
type: int
help_text: |
Estimated number of speakers in the conversation being recognized.
- api_field: config.enableSpeakerDiarization
arg_name: enable-speaker-diarization
help_text: |
Enable speaker detection for each recognized word in the top
alternative of the recognition result, using an integer `speaker_tag`
provided in the WordInfo.
type: bool
required: true
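# Illustrative sketch, not one of the shipped examples: the two flags in the
# group above are meant to be used together, e.g.:
#   $ {command} 'gs://bucket/myaudio' --language-code=en-US \
#       --enable-speaker-diarization --diarization-speaker-count=2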
- api_field: config.alternativeLanguageCodes
arg_name: additional-language-codes
metavar: language_code
repeated: true
help_text: |
The BCP-47 language tags of other languages that the speech may be
in. Up to 3 can be provided.
If alternative languages are listed, the recognition result will
contain recognition in the most likely language detected, including
the main language-code.
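# Illustrative sketch, not one of the shipped examples: since the flag is
# repeated, up to 3 alternative languages can be passed as a comma-separated
# list, e.g.:
#   $ {command} 'gs://bucket/myaudio' --language-code=en-US \
#       --additional-language-codes=es-ES,fr-FR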
args_v1p1beta1_alpha_track: # available only in 'gcloud alpha'
- arg_name: enable-automatic-punctuation
api_field: config.enableAutomaticPunctuation
help_text: |
Adds punctuation to recognition result hypotheses.
- group:
help_text: Description of audio data to be recognized.
params:
- arg_name: interaction-type
api_field: config.metadata.interactionType
help_text: |
The type of interaction in the conversation being recognized.
choices:
- arg_value: discussion
enum_value: DISCUSSION
help_text: Multiple people in a conversation or discussion.
- arg_value: phone-call
enum_value: PHONE_CALL
help_text: A phone call or video conference in which two or more people, who are not in the same room, are actively participating.
- arg_value: voicemail
enum_value: VOICEMAIL
help_text: A recorded message intended for another person to listen to.
- arg_value: professionally-produced
enum_value: PROFESSIONALLY_PRODUCED
help_text: Professionally produced audio (e.g. a TV show or podcast).
- arg_value: voice-search
enum_value: VOICE_SEARCH
help_text: Transcribe spoken questions and queries into text.
- arg_value: voice-command
enum_value: VOICE_COMMAND
help_text: Transcribe voice commands, such as for controlling a device.
- arg_value: dictation
enum_value: DICTATION
help_text: Transcribe speech to text to create a written document, such as a text message, email, or report.
- arg_name: naics-code
api_field: config.metadata.industryNaicsCodeOfAudio
type: int
help_text: |
The industry vertical to which this speech recognition request most closely applies.
- arg_name: microphone-distance
api_field: config.metadata.microphoneDistance
help_text: |
The distance at which the audio device is placed to record the conversation.
choices:
- arg_value: nearfield
enum_value: NEARFIELD
help_text: The speaker is within 1 meter of the microphone.
- arg_value: midfield
enum_value: MIDFIELD
help_text: The speaker is within 3 meters of the microphone.
- arg_value: farfield
enum_value: FARFIELD
help_text: The speaker is more than 3 meters away from the microphone.
- arg_name: original-media-type
api_field: config.metadata.originalMediaType
help_text: |
The media type of the original audio conversation.
choices:
- arg_value: audio
enum_value: AUDIO
help_text: The speech data is an audio recording.
- arg_value: video
enum_value: VIDEO
help_text: The speech data was originally recorded on a video.
- arg_name: recording-device-type
api_field: config.metadata.recordingDeviceType
help_text: |
The type of device on which the original audio was recorded.
choices:
- arg_value: smartphone
enum_value: SMARTPHONE
help_text: Speech was recorded on a smartphone.
- arg_value: pc
enum_value: PC
help_text: Speech was recorded using a personal computer or tablet.
- arg_value: phone-line
enum_value: PHONE_LINE
help_text: Speech was recorded over a phone line.
- arg_value: vehicle
enum_value: VEHICLE
help_text: Speech was recorded in a vehicle.
- arg_value: outdoor
enum_value: OTHER_OUTDOOR_DEVICE
help_text: Speech was recorded outdoors.
- arg_value: indoor
enum_value: OTHER_INDOOR_DEVICE
help_text: Speech was recorded indoors.
- arg_name: recording-device-name
api_field: config.metadata.recordingDeviceName
help_text: |
The device used to make the recording. Examples: `Nexus 5X`, `Polycom SoundStation IP 6000`.
- arg_name: original-mime-type
api_field: config.metadata.originalMimeType
help_text: |
MIME type of the original audio file. Examples: `audio/m4a`, `audio/mp3`.
- arg_name: audio-topic
api_field: config.metadata.audioTopic
help_text: |
Description of the content, e.g. "Recordings of federal supreme court hearings from 2012".
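# Illustrative sketch, not one of the shipped examples: the metadata flags
# above are independent and optional; describing a phone call recorded on a
# smartphone might look like:
#   $ {command} 'gs://bucket/myaudio' --language-code=en-US \
#       --interaction-type=phone-call --microphone-distance=nearfield \
#       --original-media-type=audio --recording-device-type=smartphone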
args:
- group:
help_text: Audio channel settings.
params:
- arg_name: separate-channel-recognition
api_field: config.enableSeparateRecognitionPerChannel
required: true
default: false
help_text: |
The recognition result will contain a `channel_tag` field indicating
which channel the result belongs to. If this is not set to true, only
the first channel will be recognized.
- arg_name: audio-channel-count
api_field: config.audioChannelCount
required: true
type: int
help_text: |
The number of channels in the input audio data. Set this when using
separate-channel-recognition. Valid values depend on the encoding:
1) LINEAR16 and FLAC: `1`-`8`
2) OGG_OPUS: `1`-`254`
3) MULAW, AMR, AMR_WB, and SPEEX_WITH_HEADER_BYTE: `1` only.
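# Illustrative sketch, not one of the shipped examples: both flags in the
# group above are required together, e.g. to transcribe each channel of a
# stereo recording separately:
#   $ {command} 'gs://bucket/myaudio' --language-code=en-US \
#       --separate-channel-recognition --audio-channel-count=2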
- group:
mutex: true
required: true
params:
- api_field: config.languageCode
arg_name: language-code
help_text: |
The language of the supplied audio as a BCP-47
(https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. Example:
"en-US". See https://cloud.google.com/speech/docs/languages for a list
of the currently supported language codes.
- api_field: config.languageCode
arg_name: language
hidden: true
action:
deprecated:
warn: Flag {flag_name} is deprecated. Use --language-code instead.
help_text: |
The language of the supplied audio as a BCP-47
(https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. Example:
"en-US". See https://cloud.google.com/speech/docs/languages for a list
of the currently supported language codes.
- api_field: config.speechContexts.phrases
arg_name: hints
default: []
help_text: |
A list of strings containing word and phrase "hints" so that the speech
recognition is more likely to recognize them. This can be used to
improve the accuracy for specific words and phrases, for example, if
specific commands are typically spoken by the user. It can also be
used to add words to the vocabulary of the recognizer. See
https://cloud.google.com/speech/limits#content.
- api_field: config.maxAlternatives
arg_name: max-alternatives
default: 1
help_text: |
Maximum number of recognition hypotheses to be returned. The server
may return fewer than max_alternatives. Valid values are 0-30. A value
of 0 or 1 will return a maximum of one.
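# Illustrative sketch, not one of the shipped examples: e.g. to request up to
# five candidate transcripts for the same audio:
#   $ {command} 'my-recording.wav' --language-code=en-US --max-alternatives=5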
- api_field: config.profanityFilter
arg_name: filter-profanity
help_text: |
If True, the server will attempt to filter out profanities, replacing
all but the initial character in each filtered word with asterisks,
e.g. `f***`.
- api_field: config.encoding
arg_name: encoding
default: encoding-unspecified
help_text: |
The audio encoding of the file. Required if the file format is not
WAV or FLAC.
- api_field: config.sampleRateHertz
arg_name: sample-rate
help_text: |
The sample rate in Hertz. For best results, set the sampling rate of
the audio source to 16000 Hz. If that's not possible, use the native
sample rate of the audio source (instead of re-sampling).
- api_field: config.enableWordTimeOffsets
arg_name: include-word-time-offsets
help_text: |
If True, the top result includes a list of words with the start and
end time offsets (timestamps) for those words. If False, no word-level
time offset information is returned.
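# Illustrative sketch, not one of the shipped examples: e.g. to get per-word
# timestamps in the top result:
#   $ {command} 'my-recording.wav' --language-code=en-US --include-word-time-offsets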