Unity整合百度語音識別和合成--REST API
阿新 • • 發佈:2019-01-01
直接上unity的C#指令碼程式碼
百度語音識別
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using System.Xml;
using LitJson;
using System.Text;
using System;
using UnityEngine.UI;
using System.IO;
/// <summary>
/// Records audio from the default microphone, uploads it to the Baidu speech
/// recognition REST API and shows the recognized text in <see cref="debugText"/>.
/// Call <see cref="StartMic"/> to begin recording and <see cref="EndMic"/> to stop
/// recording and start recognition.
/// </summary>
public class showVoiceResult1 : MonoBehaviour {
    /// <summary>Length of the recording buffer requested from the microphone, in seconds.</summary>
    private const int MaxRecordSeconds = 10;

    private string token; // access_token returned by the OAuth endpoint
    private string cuid = "liang"; // user identifier sent with every request
    // GetClipData produces header-less 16-bit PCM (no RIFF/WAV header is written),
    // which the Baidu REST API documentation calls "pcm".
    private string format = "pcm"; // audio format
    private int rate = 8000; // sample rate in Hz
    private int channel = 1; // channel count reported to the service
    private string speech; // audio data, base64 encoded
    private int len; // length of the raw (un-encoded) audio in bytes
    private string lan = "zh"; // language
    private string grant_Type = "client_credentials";
    private string client_ID = "********"; // Baidu app key
    private string client_Secret = "******"; // Baidu secret key
    // NOTE(review): this endpoint is plain HTTP, so the access token travels unencrypted.
    // Confirm whether the HTTPS variant of this endpoint can be used instead.
    private string baiduAPI = "http://vop.baidu.com/server_api";
    private string getTokenAPIPath = "https://openapi.baidu.com/oauth/2.0/token";
    private byte[] clipByte;
    public Text debugText;
    /// <summary>
    /// The text recognized from the last recording.
    /// </summary>
    public static string audioToString;
    private AudioSource aud;
    private int audioLength; // duration of the recording in whole seconds
    // Number of sample frames actually recorded; -1 means "use the whole clip".
    private int recordedSamples = -1;

    /// <summary>
    /// Resolves the AudioSource that holds the recorded clip. The field is private and
    /// was never assigned before, which made StartMic and GetClipData throw.
    /// </summary>
    void Start () {
        aud = GetComponent<AudioSource>();
        if (aud == null)
        {
            aud = gameObject.AddComponent<AudioSource>();
        }
    }

    /// <summary>
    /// Mirrors the latest recognition result into the UI text, if one is assigned.
    /// </summary>
    void Update () {
        if (debugText != null)
        {
            debugText.text = audioToString;
        }
    }

    /// <summary>
    /// Requests a Baidu access token and, on success, starts the recognition request.
    /// </summary>
    /// <param name="url">URL of the OAuth token endpoint.</param>
    /// <returns>Coroutine enumerator.</returns>
    private IEnumerator GetToken(string url)
    {
        WWWForm tokenForm = new WWWForm();
        tokenForm.AddField("grant_type", grant_Type);
        tokenForm.AddField("client_id", client_ID);
        tokenForm.AddField("client_secret", client_Secret);
        WWW tokenRequest = new WWW(url, tokenForm);
        yield return tokenRequest;

        if (!string.IsNullOrEmpty(tokenRequest.error))
        {
            Debug.LogError(tokenRequest.error);
            yield break;
        }

        JsonData tokenJson;
        if (!TryParseJson(tokenRequest.text, out tokenJson))
        {
            yield break;
        }
        // Without this check a rejected request would surface as an exception
        // from the indexer instead of a readable log entry.
        if (!HasValue(tokenJson, "access_token"))
        {
            Debug.LogError("Token response has no access_token: " + tokenRequest.text);
            yield break;
        }

        token = tokenJson["access_token"].ToString();
        StartCoroutine(GetAudioString(baiduAPI));
    }

    /// <summary>
    /// Uploads the recorded audio as a JSON document and stores the first recognition
    /// result in <see cref="audioToString"/>.
    /// </summary>
    /// <param name="url">URL of the speech recognition endpoint.</param>
    /// <returns>Coroutine enumerator.</returns>
    private IEnumerator GetAudioString(string url)
    {
        JsonWriter jw = new JsonWriter();
        jw.WriteObjectStart();
        jw.WritePropertyName("format");
        jw.Write(format);
        jw.WritePropertyName("rate");
        jw.Write(rate);
        jw.WritePropertyName("channel");
        jw.Write(channel);
        jw.WritePropertyName("token");
        jw.Write(token);
        jw.WritePropertyName("cuid");
        jw.Write(cuid);
        jw.WritePropertyName("len");
        jw.Write(len);
        jw.WritePropertyName("speech");
        jw.Write(speech);
        jw.WriteObjectEnd();

        // Encoding.Default depends on the machine's code page; UTF-8 is deterministic.
        byte[] body = Encoding.UTF8.GetBytes(jw.ToString());
        // The body is JSON, so declare it as such instead of relying on WWW's
        // default content type for raw POST data.
        Dictionary<string, string> headers = new Dictionary<string, string>();
        headers["Content-Type"] = "application/json";

        WWW recognizeRequest = new WWW(url, body, headers);
        yield return recognizeRequest;

        if (!string.IsNullOrEmpty(recognizeRequest.error))
        {
            Debug.LogError(recognizeRequest.error);
            yield break;
        }

        JsonData resultJson;
        if (!TryParseJson(recognizeRequest.text, out resultJson))
        {
            yield break;
        }
        if (!HasValue(resultJson, "err_msg") || resultJson["err_msg"].ToString() != "success.")
        {
            // Previously a failed recognition was ignored without any trace.
            Debug.LogError("Speech recognition failed: " + recognizeRequest.text);
            yield break;
        }
        if (!HasValue(resultJson, "result") || !resultJson["result"].IsArray || resultJson["result"].Count == 0)
        {
            Debug.LogError("Speech recognition returned no result: " + recognizeRequest.text);
            yield break;
        }

        string recognized = resultJson["result"][0].ToString();
        // EndsWith is safe on an empty string, unlike Substring(Length - 1).
        if (recognized.EndsWith(",", StringComparison.Ordinal))
        {
            recognized = recognized.Substring(0, recognized.Length - 1);
        }
        audioToString = recognized;
        Debug.Log(audioToString);
    }

    /// <summary>
    /// Parses <paramref name="text"/> as JSON, logging instead of throwing on failure.
    /// </summary>
    /// <param name="text">Response text to parse.</param>
    /// <param name="json">The parsed document, or null when parsing failed.</param>
    /// <returns>True when a document was parsed.</returns>
    private static bool TryParseJson(string text, out JsonData json)
    {
        json = null;
        if (string.IsNullOrEmpty(text))
        {
            Debug.LogError("Empty response from server");
            return false;
        }
        try
        {
            json = JsonMapper.ToObject(text);
        }
        catch (JsonException e)
        {
            Debug.LogError("Invalid JSON response: " + e.Message);
            return false;
        }
        return json != null;
    }

    /// <summary>
    /// Tells whether <paramref name="json"/> is an object with a non-null value for <paramref name="key"/>.
    /// </summary>
    private static bool HasValue(JsonData json, string key)
    {
        if (json == null || !json.IsObject)
        {
            return false;
        }
        if (!((IDictionary)json).Contains(key))
        {
            return false;
        }
        return json[key] != null;
    }

    /// <summary>
    /// Starts recording from the default microphone into the AudioSource's clip.
    /// </summary>
    public void StartMic()
    {
        if (Microphone.devices.Length == 0) {
            Debug.Log ("no devices");
            return;
        }
        if (aud == null)
        {
            Debug.LogError("No AudioSource available for recording");
            return;
        }
        Microphone.End(null);
        Debug.Log("Start");
        // Logging the array itself only prints its type name.
        Debug.Log(string.Join(", ", Microphone.devices));
        recordedSamples = -1;
        // Use the default device (null) so that Start matches the End/GetPosition/
        // IsRecording calls, which all address the default device.
        aud.clip = Microphone.Start(null, false, MaxRecordSeconds, rate);
        if (aud.clip == null)
        {
            Debug.LogError("Failed to start the microphone");
        }
    }

    /// <summary>
    /// Stops recording, converts the recorded audio and starts the recognition flow.
    /// </summary>
    public void EndMic()
    {
        // Both values must be read before Microphone.End stops the device.
        bool stillRecording = Microphone.IsRecording(null);
        int lastPos = Microphone.GetPosition(null);
        Debug.Log("Stop");
        Microphone.End(null);

        if (stillRecording)
        {
            // Stopped early: only the samples up to the write position are real audio.
            recordedSamples = lastPos;
            audioLength = lastPos / rate; // recording duration
        }
        else
        {
            // The non-looping recording ran to the end of its buffer.
            recordedSamples = -1;
            audioLength = MaxRecordSeconds;
        }

        clipByte = GetClipData();
        if (clipByte == null)
        {
            // GetClipData already logged the reason; there is nothing to upload.
            return;
        }
        len = clipByte.Length;
        speech = Convert.ToBase64String(clipByte);
        Debug.Log(len);
        Debug.Log(audioLength);
        StartCoroutine(GetToken(getTokenAPIPath));
    }

    /// <summary>
    /// Converts the recorded clip into 16-bit little-endian mono PCM bytes.
    /// </summary>
    /// <returns>The PCM data, or null when there is no recorded audio.</returns>
    public byte[] GetClipData()
    {
        if (aud == null || aud.clip == null)
        {
            Debug.LogError("錄音資料為空");
            return null;
        }

        AudioClip clip = aud.clip;
        int frameCount = clip.samples;
        if (recordedSamples >= 0 && recordedSamples < frameCount)
        {
            frameCount = recordedSamples;
        }
        if (frameCount <= 0)
        {
            Debug.LogError("錄音資料為空");
            return null;
        }

        // The buffer is sized as frames * channels; for multi-channel clips only the
        // first channel of each frame is kept so the output stays mono.
        int clipChannels = Mathf.Max(1, clip.channels);
        float[] samples = new float[frameCount * clipChannels];
        clip.GetData(samples, 0);

        byte[] outData = new byte[frameCount * 2];
        const int rescaleFactor = 32767; // to convert float to Int16
        for (int i = 0; i < frameCount; i++)
        {
            // Clamp first: a sample outside [-1, 1] would otherwise wrap around
            // when cast to short.
            float sample = Mathf.Clamp(samples[i * clipChannels], -1f, 1f);
            short pcm = (short)(sample * rescaleFactor);
            outData[i * 2] = (byte)(pcm & 0xFF);
            outData[i * 2 + 1] = (byte)((pcm >> 8) & 0xFF);
        }
        return outData;
    }
}
百度語音識別的指令碼主要借鑑了另一篇文章,原文地址忘了。
百度語音合成部分
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using System.Xml;
using LitJson;
using System.Text;
using System;
using UnityEngine.UI;
using System.IO;
/// <summary>
/// Sends the text shown in <see cref="debugText"/> to the Baidu text-to-speech REST
/// API, saves the returned audio as an MP3 file under
/// <c>Application.persistentDataPath</c> and loads it into an AudioSource.
/// Call <see cref="startTTS"/> to synthesize and <see cref="playAud"/> to play.
/// </summary>
public class showTextTTSResult : MonoBehaviour {
    private string text; // user input text
    private string token; // access_token
    private string cuid = "***"; // current user id
    private int ctp = 1; // client type choose, web is only value 1
    private string lan = "zh";
    private int spd = 5;
    private int pit = 5;
    private int vol = 5;
    private int per = 3; // person voice
    private string grant_Type = "client_credentials";
    private string client_ID = "****"; // Baidu app key
    private string client_Secret = "****"; // Baidu secret key
    // NOTE(review): this endpoint is plain HTTP, so the access token travels unencrypted.
    // Confirm whether the HTTPS variant of this endpoint can be used instead.
    private string baiduAPI = "http://tsn.baidu.com/text2audio";
    private string getTokenAPIPath = "https://openapi.baidu.com/oauth/2.0/token";
    private byte[] clipByte;
    public Text debugText;
    public Text debugMsg;
    /// <summary>
    /// Converted text. Not written by this class.
    /// </summary>
    public static string audioToString;
    private AudioSource aud;
    private int audioLength; // recording length; not used by this class
    private string filePath;

    /// <summary>
    /// Resolves the AudioSource used for playback. The field is private and was never
    /// assigned before, which made GetTextAudio and playAud throw.
    /// </summary>
    void Start () {
        aud = GetComponent<AudioSource>();
        if (aud == null)
        {
            aud = gameObject.AddComponent<AudioSource>();
        }
    }

    /// <summary>
    /// Requests a Baidu access token and, on success, starts the synthesis request.
    /// </summary>
    /// <param name="url">URL of the OAuth token endpoint.</param>
    /// <returns>Coroutine enumerator.</returns>
    private IEnumerator GetToken(string url)
    {
        WWWForm tokenForm = new WWWForm();
        tokenForm.AddField("grant_type", grant_Type);
        tokenForm.AddField("client_id", client_ID);
        tokenForm.AddField("client_secret", client_Secret);
        WWW tokenRequest = new WWW(url, tokenForm);
        yield return tokenRequest;
        Debug.Log (tokenRequest.text);

        if (!string.IsNullOrEmpty(tokenRequest.error))
        {
            Debug.LogError(tokenRequest.error);
            yield break;
        }

        JsonData tokenJson = null;
        try
        {
            tokenJson = JsonMapper.ToObject(tokenRequest.text);
        }
        catch (JsonException e)
        {
            Debug.LogError("Invalid JSON response: " + e.Message);
        }
        // Without this check a rejected request would surface as an exception
        // from the indexer instead of a readable log entry.
        if (tokenJson == null || !tokenJson.IsObject
            || !((IDictionary)tokenJson).Contains("access_token")
            || tokenJson["access_token"] == null)
        {
            Debug.LogError("Token response has no access_token: " + tokenRequest.text);
            yield break;
        }

        token = tokenJson["access_token"].ToString();
        Debug.Log (token);
        AppendMessage("token:"+token+"\n");
        StartCoroutine(GetTextAudio(baiduAPI));
    }

    /// <summary>
    /// Posts the text to the synthesis endpoint, writes the returned audio to
    /// <c>1.mp3</c> in the persistent data folder and loads it as the AudioSource clip.
    /// </summary>
    /// <param name="url">URL of the text-to-speech endpoint.</param>
    /// <returns>Coroutine enumerator.</returns>
    private IEnumerator GetTextAudio(string url){
        if (debugText == null)
        {
            Debug.LogError("debugText is not assigned; there is no text to synthesize");
            yield break;
        }
        string textToSpeak = debugText.text;

        // url?lan ctp cuid tok tex vol per spd pit
        WWWForm ttsForm = new WWWForm();
        ttsForm.AddField ("lan", lan);
        ttsForm.AddField ("ctp", ctp);
        ttsForm.AddField ("cuid", cuid);
        ttsForm.AddField ("tok", token);
        ttsForm.AddField ("tex", textToSpeak);
        ttsForm.AddField ("vol", vol);
        ttsForm.AddField ("per", per);
        ttsForm.AddField ("spd", spd);
        ttsForm.AddField ("pit", pit);
        WWW ttsRequest = new WWW (url, ttsForm);
        yield return ttsRequest;

        // Check the transport result before touching the payload; previously the bytes
        // were written to disk first and the error was only logged afterwards.
        if (!string.IsNullOrEmpty(ttsRequest.error))
        {
            Debug.Log (ttsRequest.error);
            SetMessage("fail");
            yield break;
        }

        byte[] audioBytes = ttsRequest.bytes;
        // A body starting with '{' is treated as a JSON error document rather than
        // audio, so it is not saved as an MP3 file.
        if (audioBytes == null || audioBytes.Length == 0 || audioBytes[0] == (byte)'{')
        {
            Debug.LogError("Text-to-speech failed: " + ttsRequest.text);
            SetMessage("fail");
            yield break;
        }

        filePath = Application.persistentDataPath+"/1.mp3";
        if (!writeFile (audioBytes, filePath)) {
            SetMessage("fail");
            yield break;
        }
        AppendMessage("success to translate txt to voice\n");
        AppendMessage("the voice byte[] length:"+audioBytes.Length+"\n");

        // Unix-style paths already start with '/', giving file:///...; drive-letter
        // paths need the third slash added explicitly.
        string fileUrl = filePath.StartsWith("/", StringComparison.Ordinal)
            ? "file://" + filePath
            : "file:///" + filePath;
        WWW clipLoader = new WWW (fileUrl);
        // Wait for the file to load; asking for the clip immediately after
        // constructing the WWW hands back audio that is not ready yet.
        yield return clipLoader;
        if (!string.IsNullOrEmpty(clipLoader.error))
        {
            Debug.LogError(clipLoader.error);
            SetMessage("fail");
            yield break;
        }
        if (aud == null)
        {
            Debug.LogError("No AudioSource available for playback");
            yield break;
        }
        aud.clip = clipLoader.GetAudioClip (false, false, AudioType.MPEG);
        Debug.Log (textToSpeak);
        Debug.Log (audioBytes.Length);
    }

    /// <summary>
    /// Writes <paramref name="readByte"/> to <paramref name="fileName"/>, replacing any
    /// existing file.
    /// </summary>
    /// <param name="readByte">Bytes to write.</param>
    /// <param name="fileName">Destination path.</param>
    /// <returns>True on success; false (after logging the reason) on failure.</returns>
    private bool writeFile(byte[] readByte,string fileName){
        if (readByte == null || string.IsNullOrEmpty(fileName))
        {
            Debug.LogError("writeFile called without data or file name");
            return false;
        }
        try{
            // WriteAllBytes overwrites an existing file, so no separate delete is
            // needed and no stale tail is left behind as with FileMode.OpenOrCreate.
            File.WriteAllBytes(fileName, readByte);
            return true;
        }catch (Exception e){
            // Keep the "return false on any failure" contract, but say why it failed.
            Debug.LogError("Failed to write " + fileName + ": " + e.Message);
            return false;
        }
    }

    /// <summary>
    /// Clears the status text and starts the token + synthesis flow.
    /// </summary>
    public void startTTS()
    {
        SetMessage("");
        StartCoroutine(GetToken(getTokenAPIPath));
    }

    /// <summary>
    /// Plays the synthesized clip and reports the AudioSource state in the status text.
    /// </summary>
    public void playAud(){
        if (aud == null)
        {
            Debug.LogError("No AudioSource available for playback");
            return;
        }
        if (aud.clip == null)
        {
            Debug.LogWarning("No synthesized audio has been loaded yet");
        }
        aud.Play ();
        AppendMessage("play the audio:"+aud.isPlaying+"\n");
        AppendMessage("the audio useful:"+aud.enabled+"\n");
    }

    /// <summary>Replaces the status text when a status Text is assigned.</summary>
    private void SetMessage(string message)
    {
        if (debugMsg != null)
        {
            debugMsg.text = message;
        }
    }

    /// <summary>Appends to the status text when a status Text is assigned.</summary>
    private void AppendMessage(string message)
    {
        if (debugMsg != null)
        {
            debugMsg.text += message;
        }
    }
}
語音合成指令碼是仿照上面的百度語音識別指令碼寫的。重點在於:獲取到的音訊無法在Unity中直接播放,主要是資料夾許可權問題——Unity可讀寫的資料夾和Android不一樣,有固定的對應資料夾;Application.persistentDataPath就是一個可讀寫的資料夾(相關知識有部落格介紹,可自行搜尋,地址忘了)。因此先把返回的MP3資料寫入該資料夾,再用 aud.clip = w.GetAudioClip (false, false, AudioType.MPEG); 將MP3檔案賦給Unity的音訊物件。