public byte[] cacheFile(String toSpeak, OutputFormat format) throws IOException { byte[] mp3File = null; // cache it begin ----- String localFileName = getLocalFileName(this, toSpeak, "mp3"); // String filename = AudioFile.globalFileCacheDir + File.separator + // localFileName; if (!audioFile.cacheContains(localFileName)) { log.info("retrieving speech from Amazon - {}", localFileName); AmazonPollyClient polly = getPolly(); SynthesizeSpeechRequest synthReq = new SynthesizeSpeechRequest().withText(toSpeak).withVoiceId(awsVoice.getId()).withOutputFormat(format); SynthesizeSpeechResult synthRes = polly.synthesizeSpeech(synthReq); InputStream data = synthRes.getAudioStream(); mp3File = FileIO.toByteArray(data); audioFile.cache(localFileName, mp3File, toSpeak); } else { log.info("using local cached file"); mp3File = FileIO.toByteArray(new File(AudioFile.globalFileCacheDir + File.separator + getLocalFileName(this, toSpeak, "mp3"))); } // invoke("publishStartSpeaking", toSpeak); // audioFile.playBlocking(filename); // invoke("publishEndSpeaking", toSpeak); // log.info("Finished waiting for completion."); return mp3File; }
/**
 * Synthesizes speech with Polly from the {@code voiceId}, {@code text} and
 * {@code outputFormat} query parameters and streams the resulting audio back as the
 * response body, using the content type reported by Polly.
 *
 * @param ctx the request context
 * @throws Exception if the synthesis request fails
 */
@Override
public void handle(Context ctx) throws Exception {
    String voiceId = ctx.getRequest().getQueryParams().get("voiceId");
    String text = ctx.getRequest().getQueryParams().get("text");
    String outputFormat = ctx.getRequest().getQueryParams().get("outputFormat");

    SynthesizeSpeechRequest ssRequest = new SynthesizeSpeechRequest();
    ssRequest.setVoiceId(voiceId);
    ssRequest.setOutputFormat(outputFormat);
    ssRequest.setText(text);

    SynthesizeSpeechResult result = polly.synthesizeSpeech(ssRequest);

    ctx.getResponse().contentType(result.getContentType());
    ctx.getResponse().sendStream(s -> s.onSubscribe(new Subscription() {
        // Set on the first request(); later (possibly re-entrant) calls are ignored.
        private boolean started;
        // Set by cancel() to stop the read loop early.
        private volatile boolean cancelled;

        @Override
        public void request(long n) {
            // The whole stream is pushed on the first demand signal (the requested
            // amount is not honoured). request() may be called again, including
            // synchronously from within onNext, and must not restart the loop or
            // signal completion twice.
            if (started) {
                return;
            }
            started = true;

            // Fully qualified so no additional import is required.
            try (java.io.InputStream audio = result.getAudioStream()) {
                byte[] chunk = new byte[1024];
                int bytesRead;
                while (!cancelled && (bytesRead = audio.read(chunk)) != -1) {
                    // Copy exactly the bytes read: the array is reused for the next
                    // read and a short read must not emit stale trailing bytes.
                    s.onNext(Unpooled.copiedBuffer(chunk, 0, bytesRead));
                }
            } catch (IOException e) {
                // NOTE(review): the response may already be partially streamed at
                // this point; confirm that sending a 500 here has the intended effect.
                ctx.getResponse().status(500);
                ctx.getResponse().send();
            } finally {
                s.onComplete();
            }
        }

        @Override
        public void cancel() {
            cancelled = true;
        }
    }));
}
/** * Returns text-to-speech of a translation of a given text. Before translating the text, * requesting speech from AWS Polly and storing the resulting MP3 to S3 this method looks * up previous translation of the same text. Once found it will avoid doing the aforementioned * roundtrip but rather will use the data of the previous translation. * @param text text to translate and convert to speech * @return text to speech information * @throws AlexaStateException error reading or writing state to Dynamo dictionary */ public Optional<TextToSpeech> textToSpeech(final String text) throws AlexaStateException { // remove invalid prefixes that accidently made it into the slots final String textToTranslate = prefixesToRemove.stream() .filter(prefix -> StringUtils.startsWithIgnoreCase(text, prefix)) .findFirst() .map(prefix -> text.replaceFirst(prefix, "")) // if none of these prefixes exist in the text, keep the text as is .orElse(text); // look up previous translation in dictionary Optional<TextToSpeech> tts = dynamoStateHandler.readModel(TextToSpeech.class, getDictionaryId(textToTranslate)); // if there was a previous tts for this text return immediately (exception for the roundtrip-phrase used by the test-client) if (tts.isPresent() && !StringUtils.equalsIgnoreCase(textToTranslate, SkillConfig.getAlwaysRoundTripPhrase())) { // set handler to session to avoid writing back to dynamo (nothing changed) tts.get().setHandler(sessionStateHandler); return tts; } // translate term by leveraging a Translator implementation provided by the factory final Optional<String> translated = translator.translate(textToTranslate, language); if (translated.isPresent()) { // without a voiceId there's not chance to fulfill the translation request Validate.notBlank(voiceId, "No voiceId is associated with given language."); // form the SSML by embedding the translated text final String ssml = String.format("<speak><amazon:effect name='drc'><prosody rate='-15%%' 
volume='x-loud'>%1$s</prosody></amazon:effect><break time='250ms' /></speak>", translated.get()); // build a Polly request to get speech with desired voice and SSML final SynthesizeSpeechRequest synthRequest = new SynthesizeSpeechRequest() .withText(ssml) .withOutputFormat(OutputFormat.Mp3) .withVoiceId(voiceId) .withTextType(TextType.Ssml) .withSampleRate("22050"); // fire request to Polly final SynthesizeSpeechResult synthResult = awsPolly.synthesizeSpeech(synthRequest); try { // store audio stream of Polly to S3 as an MP3 file final PutObjectRequest s3Put = new PutObjectRequest(SkillConfig.getS3BucketName(), getMp3Path(textToTranslate), synthResult.getAudioStream(), new ObjectMetadata()) .withCannedAcl(CannedAccessControlList.PublicRead); awsS3.putObject(s3Put); // as long as Polly output does not comply with Alexa MP3 format restriction we need to convert the MP3 if (!SkillConfig.shouldSkipMp3Conversion()) { // call the REST service that encapsualtes the FFMPEG conversion on a server final String mp3ConvertedUrl = Mp3Converter.convertMp3(getMp3Path(textToTranslate)); // validate this service returned a url (equal to success) Validate.notBlank(mp3ConvertedUrl, "Conversion service did not return proper return value"); } // build the TTS object with all the information needed to return output speech return Optional.of(getTTS(textToTranslate, translated.get())); } catch (final IOException | URISyntaxException e) { log.error("Error while generating mp3. " + e.getMessage()); } } return Optional.empty(); }