File: //snap/google-cloud-cli/396/lib/surface/ml/speech/__init__.yaml
examples:
recognize: |
To get a transcript of an audio file 'my-recording.wav':
$ {command} 'my-recording.wav' --language-code=en-US
To get a transcript of the audio file at the Google Cloud Storage URL
'gs://bucket/myaudio' with a custom sampling rate and encoding, using hints
and filtering profanity:
$ {command} 'gs://bucket/myaudio' --language-code=es-ES --sample-rate=2200 --hints=Bueno --encoding=OGG_OPUS --filter-profanity
args_v1:
- api_field: audio
arg_name: audio
help_text: |
The location of the audio file to transcribe. Must be a local path or a
Google Cloud Storage URL (in the format gs://bucket/object).
is_positional: true
processor: googlecloudsdk.command_lib.ml.speech.util:GetAudioHook:version=v1
args_v1p1beta1:
- api_field: audio
arg_name: audio
help_text: |
The location of the audio file to transcribe. Must be a local path or a
Google Cloud Storage URL (in the format gs://bucket/object).
is_positional: true
processor: googlecloudsdk.command_lib.ml.speech.util:GetAudioHook:version=v1p1beta1
- api_field: config.enableWordConfidence
arg_name: include-word-confidence
help_text: |
Include a list of words and the confidence for those words in the top
result.
- group:
params:
- api_field: config.diarizationSpeakerCount
arg_name: diarization-speaker-count
type: int
help_text: |
Estimated number of speakers in the conversation being recognized.
- api_field: config.enableSpeakerDiarization
arg_name: enable-speaker-diarization
help_text: |
Enable speaker detection for each recognized word in the top
alternative of the recognition result, using an integer `speaker_tag`
provided in the WordInfo.
type: bool
required: true
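# Illustrative sketch, not one of the shipped examples: the two flags in the
# group above are meant to be used together, e.g.:
#   $ {command} 'gs://bucket/myaudio' --language-code=en-US \
#       --enable-speaker-diarization --diarization-speaker-count=2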
- api_field: config.alternativeLanguageCodes
arg_name: additional-language-codes
metavar: language_code
repeated: true
help_text: |
The BCP-47 language tags of other languages that the speech may be
in. Up to 3 can be provided.
If alternative languages are listed, the recognition result will
contain recognition in the most likely language detected, including
the main language-code.
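# Illustrative sketch, not one of the shipped examples: since the flag is
# repeated, up to 3 alternative languages can be passed as a comma-separated
# list, e.g.:
#   $ {command} 'gs://bucket/myaudio' --language-code=en-US \
#       --additional-language-codes=es-ES,fr-FR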
args_v1p1beta1_alpha_track: # available only in 'gcloud alpha'
- arg_name: enable-automatic-punctuation
api_field: config.enableAutomaticPunctuation
help_text: |
Adds punctuation to recognition result hypotheses.
- group:
help_text: Description of audio data to be recognized.
params:
- arg_name: interaction-type
api_field: config.metadata.interactionType
help_text: |
The type of interaction in the conversation being recognized.
choices:
- arg_value: discussion
enum_value: DISCUSSION
help_text: Multiple people in a conversation or discussion.
- arg_value: phone-call
enum_value: PHONE_CALL
help_text: A phone call or video conference in which two or more people, who are not in the same room, are actively participating.
- arg_value: voicemail
enum_value: VOICEMAIL
help_text: A recorded message intended for another person to listen to.
- arg_value: professionally-produced
enum_value: PROFESSIONALLY_PRODUCED
help_text: Professionally produced audio (e.g. a TV show or podcast).
- arg_value: voice-search
enum_value: VOICE_SEARCH
help_text: Transcribe spoken questions and queries into text.
- arg_value: voice-command
enum_value: VOICE_COMMAND
help_text: Transcribe voice commands, such as for controlling a device.
- arg_value: dictation
enum_value: DICTATION
help_text: Transcribe speech to text to create a written document, such as a text message, email, or report.
- arg_name: naics-code
api_field: config.metadata.industryNaicsCodeOfAudio
type: int
help_text: |
The industry vertical to which this speech recognition request most closely applies.
- arg_name: microphone-distance
api_field: config.metadata.microphoneDistance
help_text: |
The distance at which the audio device is placed to record the conversation.
choices:
- arg_value: nearfield
enum_value: NEARFIELD
help_text: The speaker is within 1 meter of the microphone.
- arg_value: midfield
enum_value: MIDFIELD
help_text: The speaker is within 3 meters of the microphone.
- arg_value: farfield
enum_value: FARFIELD
help_text: The speaker is more than 3 meters away from the microphone.
- arg_name: original-media-type
api_field: config.metadata.originalMediaType
help_text: |
The media type of the original audio conversation.
choices:
- arg_value: audio
enum_value: AUDIO
help_text: The speech data is an audio recording.
- arg_value: video
enum_value: VIDEO
help_text: The speech data was originally recorded on a video.
- arg_name: recording-device-type
api_field: config.metadata.recordingDeviceType
help_text: |
The type of device on which the original audio was recorded.
choices:
- arg_value: smartphone
enum_value: SMARTPHONE
help_text: Speech was recorded on a smartphone.
- arg_value: pc
enum_value: PC
help_text: Speech was recorded using a personal computer or tablet.
- arg_value: phone-line
enum_value: PHONE_LINE
help_text: Speech was recorded over a phone line.
- arg_value: vehicle
enum_value: VEHICLE
help_text: Speech was recorded in a vehicle.
- arg_value: outdoor
enum_value: OTHER_OUTDOOR_DEVICE
help_text: Speech was recorded outdoors.
- arg_value: indoor
enum_value: OTHER_INDOOR_DEVICE
help_text: Speech was recorded indoors.
- arg_name: recording-device-name
api_field: config.metadata.recordingDeviceName
help_text: |
The device used to make the recording. Examples: `Nexus 5X`, `Polycom SoundStation IP 6000`.
- arg_name: original-mime-type
api_field: config.metadata.originalMimeType
help_text: |
MIME type of the original audio file. Examples: `audio/m4a`, `audio/mp3`.
- arg_name: audio-topic
api_field: config.metadata.audioTopic
help_text: |
Description of the content, e.g. "Recordings of federal supreme court hearings from 2012".
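# Illustrative sketch, not one of the shipped examples: the metadata flags
# above are independent and optional; describing a phone call recorded on a
# smartphone might look like:
#   $ {command} 'gs://bucket/myaudio' --language-code=en-US \
#       --interaction-type=phone-call --microphone-distance=nearfield \
#       --original-media-type=audio --recording-device-type=smartphone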
args:
- group:
help_text: Audio channel settings.
params:
- arg_name: separate-channel-recognition
api_field: config.enableSeparateRecognitionPerChannel
required: true
default: false
help_text: |
The recognition result will contain a `channel_tag` field indicating
which channel the result belongs to. If this is not set to true, only
the first channel will be recognized.
- arg_name: audio-channel-count
api_field: config.audioChannelCount
required: true
type: int
help_text: |
The number of channels in the input audio data. Set this when using
separate-channel-recognition. Valid values depend on the encoding:
1) LINEAR16 and FLAC: `1`-`8`
2) OGG_OPUS: `1`-`254`
3) MULAW, AMR, AMR_WB, and SPEEX_WITH_HEADER_BYTE: `1` only.
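# Illustrative sketch, not one of the shipped examples: both flags in the
# group above are required together, e.g. to transcribe each channel of a
# stereo recording separately:
#   $ {command} 'gs://bucket/myaudio' --language-code=en-US \
#       --separate-channel-recognition --audio-channel-count=2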
- group:
mutex: true
required: true
params:
- api_field: config.languageCode
arg_name: language-code
help_text: |
The language of the supplied audio as a BCP-47
(https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. Example:
"en-US". See https://cloud.google.com/speech/docs/languages for a list
of the currently supported language codes.
- api_field: config.languageCode
arg_name: language
hidden: true
action:
deprecated:
warn: Flag {flag_name} is deprecated. Use --language-code instead.
help_text: |
The language of the supplied audio as a BCP-47
(https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. Example:
"en-US". See https://cloud.google.com/speech/docs/languages for a list
of the currently supported language codes.
- api_field: config.speechContexts.phrases
arg_name: hints
default: []
help_text: |
A list of strings containing word and phrase "hints" so that the speech
recognition is more likely to recognize them. This can be used to
improve the accuracy for specific words and phrases, for example, if
specific commands are typically spoken by the user. It can also be
used to add words to the vocabulary of the recognizer. See
https://cloud.google.com/speech/limits#content.
- api_field: config.maxAlternatives
arg_name: max-alternatives
default: 1
help_text: |
Maximum number of recognition hypotheses to be returned. The server
may return fewer than max_alternatives. Valid values are 0-30. A value
of 0 or 1 will return a maximum of one.
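# Illustrative sketch, not one of the shipped examples: e.g. to request up to
# five candidate transcripts for the same audio:
#   $ {command} 'my-recording.wav' --language-code=en-US --max-alternatives=5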
- api_field: config.profanityFilter
arg_name: filter-profanity
help_text: |
If True, the server will attempt to filter out profanities, replacing
all but the initial character in each filtered word with asterisks,
e.g. `f***`.
- api_field: config.encoding
arg_name: encoding
default: encoding-unspecified
help_text: |
The audio encoding of the file. Required if the file format is not
WAV or FLAC.
- api_field: config.sampleRateHertz
arg_name: sample-rate
help_text: |
The sample rate in Hertz. For best results, set the sampling rate of
the audio source to 16000 Hz. If that's not possible, use the native
sample rate of the audio source (instead of re-sampling).
- api_field: config.enableWordTimeOffsets
arg_name: include-word-time-offsets
help_text: |
If True, the top result includes a list of words with the start and
end time offsets (timestamps) for those words. If False, no word-level
time offset information is returned.
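# Illustrative sketch, not one of the shipped examples: e.g. to get per-word
# timestamps in the top result:
#   $ {command} 'my-recording.wav' --language-code=en-US --include-word-time-offsets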