1. 程式人生 > >關於語音合成和識別

關於語音合成和識別

沒有 asr 進制 lower 編碼 IV 業務 key -i

最近研究了下語音合成和語音識別。分別看了一些文章,也下載了 SDK 寫了些代碼測試了下。

發現對於語音合成,就中文來說,百度語音和科大訊飛基本都差不多。

英文的話,百度合成出來的效果不佳,科大訊飛稍好點,但是總體都沒有國外的語音合成好,比如 iSpeech、FreeTTS,可能是因為國外的母語都是英語的緣故吧。

百度日調用額度比較多,據說有2萬額度。訊飛每天就500,有點少。iSpeech 是要收費的。FreeTTS 可以離線使用。

百度識別和合成代碼:

public class SoundAPI
{
	private static final Logger logger = LoggerFactory.getLogger(SoundAPI.class);
	/** Value of the "download.folder" setting; used as the prefix of every generated audio file path. */
	final static String FILE_PATH = Config.getString("download.folder");
	// Baidu credentials: APPID / API key (AK) / secret key (SK)
	private static final String APP_ID = "你的APP ID";
	private static final String API_KEY = "你的key";
	private static final String SECRET_KEY = "你的秘鑰";
	// Shared AipSpeech client; created (and periodically re-created) by iniAPI()
	private static AipSpeech client = null;
	/** Time in epoch milliseconds at which the client was last created; 0 until the first creation. */
	private static long iniTime = 0L;
	/**
	 * Maximum client age: 30 days expressed in milliseconds.
	 * The first factor is a long literal on purpose. With plain int factors the product
	 * (2,592,000,000) exceeds Integer.MAX_VALUE and wraps to a negative number, which makes
	 * every "is the client older than a month?" comparison true.
	 */
	private static final long MONTH_TIME = 30L * 24 * 60 * 60 * 1000;
	private static final Base64 base64 = new Base64();

	/**
	 * Makes sure a usable AipSpeech client exists.
	 * A new client is built when none exists yet or when more than MONTH_TIME milliseconds
	 * have passed since the last one was built; otherwise the current client is kept.
	 */
	private static void iniAPI()
	{
		long age = System.currentTimeMillis() - iniTime;
		boolean reusable = client != null && age <= MONTH_TIME;
		if (reusable)
		{
			return;
		}

		client = new AipSpeech(APP_ID, API_KEY, SECRET_KEY);
		// 2-second connection timeout
		client.setConnectionTimeoutInMillis(2000);
		// Remember when this client was built so its age can be checked on later calls
		iniTime = System.currentTimeMillis();
	}

	/**
	 * Synthesizes {@code text} with the Baidu speech client and stores the returned audio
	 * below {@code FILE_PATH}.
	 *
	 * The stored file name is the Base64 form of {@code fileName} (with '=' padding removed),
	 * placed in a sub folder named after the language code ("zh" or "en"). Because the name is
	 * derived from {@code fileName}, an already existing file is left untouched instead of
	 * being written again.
	 *
	 * @param text         text to synthesize; an empty value returns "" without calling the API
	 * @param fileName     logical name the stored file name is derived from
	 * @param questionType selects English ("en") when its type equals ENGLISH_WORD's type,
	 *                     otherwise Chinese ("zh") is used
	 * @return the file name relative to {@code FILE_PATH} (for example "zh/xxxx.mp3"), or ""
	 *         when the text is empty, the API returned no audio data, or any exception occurred
	 */
	public static String getSoundMp3(String text, String fileName, QuestionTypeEnum questionType)
	{
		if (StringUtils.isEmpty(text))
		{
			return "";
		}

		String rtnfileName = "";
		String type = "zh";

		try
		{
			iniAPI();

			if (QuestionTypeEnum.ENGLISH_WORD.getType().equals(questionType.getType()))
			{
				type = "en";
			}

			TtsResponse res = client.synthesis(text, type, 1, null);
			byte[] data = res.getData();
			if (data != null)
			{
				// NOTE(review): standard Base64 output may contain '/', which would act as a
				// path separator in the file name below - confirm how Util handles that.
				String encodedName = base64.encodeToString(fileName.getBytes());
				rtnfileName = type + "/" + encodedName.replaceAll("=", "") + ".mp3";
				String path = FILE_PATH + rtnfileName;
				File file = new File(path);
				if (!file.exists())
				{
					Util.writeBytesToFileSystem(data, path);
				}
			} else
			{
				// No audio came back: log the result object of the response.
				// The "{}" placeholder is required; without it the logger ignores the argument
				// and the error details never reach the log.
				JSONObject jsonObj = res.getResult();
				logger.info("invoke baidu synthesis API error: {}", jsonObj);
			}
		} catch (Exception e)
		{
			// Any failure (including one while writing the file) is reported as "no file"
			rtnfileName = "";
			logger.error("invoke baidu synthesis API error:", e);
		}

		return rtnfileName;
	}

	/**
	 * Runs Baidu speech recognition on the audio file at {@code filePath}.
	 * The file is submitted as "pcm" at a rate of 16000.
	 *
	 * @param filePath     path of the audio file; an empty value returns "" without calling the API
	 * @param questionType when its type equals ENGLISH_WORD's type the request carries
	 *                     {@code dev_pid = 1737} (presumably the English model id - confirm
	 *                     against the Baidu documentation); otherwise no options are sent
	 * @return the text produced by {@code getResult}, or "" when the path is empty or an
	 *         exception occurred
	 */
	public static String recognizeSound(String filePath, QuestionTypeEnum questionType)
	{
		if (StringUtils.isEmpty(filePath))
		{
			return "";
		}

		try
		{
			iniAPI();

			boolean english = QuestionTypeEnum.ENGLISH_WORD.getType().equals(questionType.getType());
			HashMap<String, Object> options = null;
			if (english)
			{
				options = new HashMap<>();
				options.put("dev_pid", 1737);
			}

			JSONObject response = client.asr(filePath, "pcm", 16000, options);
			return getResult(response);
		} catch (Exception e)
		{
			logger.error("invoke baidu asr API error:", e);
		}

		return "";
	}

	/**
	 * Extracts the recognized text from a Baidu ASR response.
	 *
	 * When {@code err_no} is 0 the entries of the {@code result} array are joined with ';'
	 * (the first entry is always taken, later entries only when they are not empty) and every
	 * "," is removed from the joined text. For any other {@code err_no} the whole response is
	 * logged and "" is returned.
	 *
	 * @param asrRes response object returned by the ASR call
	 * @return the joined recognition text, or "" when the response reports an error
	 */
	private static String getResult(JSONObject asrRes)
	{
		if (asrRes.getInt("err_no") != 0)
		{
			// The "{}" placeholder is required; without it the logger ignores the argument
			// and the failing response never reaches the log.
			logger.error("invoke baidu asr API error: {}", asrRes);
			return "";
		}

		JSONArray arrayResult = asrRes.getJSONArray("result");
		StringBuilder sbResult = new StringBuilder();
		for (int i = 0; i < arrayResult.length(); i++)
		{
			String candidate = arrayResult.get(i).toString();
			if (i == 0)
			{
				sbResult.append(candidate);
			} else if (!StringUtils.isEmpty(candidate))
			{
				sbResult.append(';').append(candidate);
			}
		}

		return sbResult.toString().replaceAll(",", "");
	}

  科大訊飛的語音識別及合成

public class IatAPI
{
	private static final Logger logger = LoggerFactory.getLogger(IatAPI.class);
	/**
	 * iFlytek speech recognition (IAT); written with reference to
	 * https://github.com/IflytekAIUI/DemoCode/blob/master/webapi/java/Iat.java
	 */
	// Application id, sent as the X-Appid request header
	final static String APPID = "你的APPID";
	// API key, used as the first component of the X-CheckSum digest
	final static String APPKEY_IAT = "你的秘鑰";
	// Endpoint the recognition request is posted to
	final static String URL_IAT = "http://api.xfyun.cn/v1/service/v1/iat";
	// Value sent as the X-Real-Ip request header
	final static String IP = "服務器IP地址";

	/**
	 * Sends an audio file to the recognition endpoint and returns the raw response.
	 *
	 * The file is read into a byte array, Base64 encoded and posted as the form field
	 * {@code audio}, together with headers built for aue "raw" and engine type "sms16k".
	 *
	 * @param filePath path of the audio file to recognize
	 * @return whatever {@code HttpUtil.doPost} returns for the request
	 * @throws Exception if building the headers, reading the file or the HTTP call fails
	 */
	public static String process(String filePath) throws Exception
	{
		Map<String, String> requestHeader = getHeader("raw", "sms16k");

		byte[] rawAudio = FileUtil.read2ByteArray(filePath);
		byte[] encodedAudio = Base64.encodeBase64(rawAudio);

		// NOTE(review): the Base64 text is placed in the form body without URL-encoding;
		// confirm whether HttpUtil.doPost encodes it or whether '+', '/' and '=' need escaping.
		String requestBody = "audio=" + new String(encodedAudio, "UTF-8");

		return HttpUtil.doPost(URL_IAT, requestHeader, requestBody);
	}

	/**
	 * Builds the HTTP request headers for a recognition call.
	 *
	 * The business parameters are written as a small JSON object, Base64 encoded and sent as
	 * X-Param. X-CheckSum is the MD5 hex digest of API key + current time + X-Param.
	 *
	 * @param aue        value of the "aue" business parameter
	 * @param engineType value of the "engine_type" business parameter
	 * @return a map holding Content-Type, X-Param, X-CurTime, X-CheckSum, X-Appid and X-Real-Ip
	 * @throws UnsupportedEncodingException if the UTF-8 charset is not available
	 */
	private static Map<String, String> getHeader(String aue, String engineType) throws UnsupportedEncodingException
	{
		// Current time in whole seconds
		String curTime = String.valueOf(System.currentTimeMillis() / 1000L);

		// Business parameters as JSON, then Base64
		StringBuilder json = new StringBuilder();
		json.append("{\"aue\":\"").append(aue).append("\"");
		json.append(",\"engine_type\":\"").append(engineType).append("\"}");
		byte[] encodedParam = Base64.encodeBase64(json.toString().getBytes("UTF-8"));
		String paramBase64 = new String(encodedParam);

		// Request token: digest over API key, time stamp and encoded parameters
		String checkSum = DigestUtils.md5Hex(APPKEY_IAT + curTime + paramBase64);

		Map<String, String> header = new HashMap<String, String>();
		header.put("Content-Type", "application/x-www-form-urlencoded; charset=utf-8");
		header.put("X-Param", paramBase64);
		header.put("X-CurTime", curTime);
		header.put("X-CheckSum", checkSum);
		header.put("X-Appid", APPID);
		header.put("X-Real-Ip", IP);
		return header;
	}

  

public class TtsAPI
{
	private static final Logger logger = LoggerFactory.getLogger(TtsAPI.class);
	/**
	 * iFlytek web API client; written with reference to
	 * https://github.com/IflytekAIUI/DemoCode/blob/master/webapi/java/Iat.java
	 * NOTE(review): the original comment and the link describe speech recognition (Iat),
	 * while this class calls the TTS endpoint - looks copied from IatAPI; confirm.
	 */
	// Application id, sent as the X-Appid request header
	final static String APPID = "你的APP id";
	// API key, used as the first component of the X-CheckSum digest
	final static String APPKEY_TTS = "你的秘鑰";
	// Endpoint the synthesis request is posted to
	final static String URL_TTS = "http://api.xfyun.cn/v1/service/v1/tts";
	// Value sent as the X-Real-Ip request header
	final static String IP = "服務器地址";
	// Value of the "download.folder" setting; passed to FileUtil.save as the target folder
	final static String FILE_PATH = Config.getString("download.folder");

	/**
	 * Sends text to the synthesis endpoint and stores the returned audio.
	 *
	 * When the response Content-Type is "audio/mpeg" the body is saved under
	 * {@code FILE_PATH} as "&lt;sid&gt;.mp3" and that file name is returned. Any other response is
	 * treated as a failure and its body is logged.
	 *
	 * @param text text to synthesize; a null or empty value returns {@code null} without
	 *             contacting the service
	 * @return the name of the saved mp3 file, or {@code null} when nothing was synthesized
	 * @throws Exception kept in the signature for callers; failures of the request itself are
	 *                   caught and logged here
	 */
	public static String process(String text) throws Exception
	{
		// Nothing to synthesize: do not send "text=null" / "text=" to the remote service
		if (StringUtil.isNullOrEmpty(text))
		{
			return null;
		}

		String result = null;
		long startTime = System.currentTimeMillis();
		try
		{
			Map<String, String> header = getHeader("audio/L16;rate=16000", "lame", "xiaoyan", "50", "50", "", "text",
					"50");
			// NOTE(review): the text is placed in the form body without URL-encoding; confirm
			// whether HttpUtil.doMultiPost encodes it or whether '&', '+' and '%' need escaping.
			Map<String, Object> resultMap = HttpUtil.doMultiPost(URL_TTS, header, "text=" + text);
			if ("audio/mpeg".equals(resultMap.get("Content-Type")))
			{
				// Synthesis succeeded: the body holds the audio bytes
				String mp3Name = resultMap.get("sid") + ".mp3";
				FileUtil.save(FILE_PATH, mp3Name, (byte[]) resultMap.get("body"));
				result = mp3Name;
			} else
			{
				// Synthesis failed: log the body; String.valueOf tolerates a missing body
				logger.error(String.valueOf(resultMap.get("body")));
			}
		} catch (Exception e)
		{
			logger.error("there is error:", e);
		}

		long elapsedMillis = System.currentTimeMillis() - startTime;
		logger.info("finish get voice:{}", elapsedMillis);

		return result;
	}

	/**
	 * Builds the HTTP request headers for a synthesis call.
	 *
	 * The business parameters are written as a JSON object: "auf" is always present, every
	 * other parameter is added only when its value is neither null nor empty. The JSON is
	 * Base64 encoded and sent as X-Param. X-CheckSum is the MD5 hex digest of
	 * API key + current time + X-Param.
	 *
	 * @param auf        value of the "auf" business parameter (always written)
	 * @param aue        value of the "aue" business parameter
	 * @param voiceName  value of the "voice_name" business parameter
	 * @param speed      value of the "speed" business parameter
	 * @param volume     value of the "volume" business parameter
	 * @param engineType value of the "engine_type" business parameter
	 * @param textType   value of the "text_type" business parameter
	 * @param pitch      value of the "pitch" business parameter
	 * @return a map holding Content-Type, X-Param, X-CurTime, X-CheckSum, X-Real-Ip and X-Appid
	 * @throws UnsupportedEncodingException if the UTF-8 charset is not available
	 */
	private static Map<String, String> getHeader(String auf, String aue, String voiceName, String speed, String volume,
			String engineType, String textType, String pitch) throws UnsupportedEncodingException
	{
		// Current time in whole seconds
		String timestamp = String.valueOf(System.currentTimeMillis() / 1000L);

		// Optional parameters as (key, value) pairs, in the order they are written to the JSON
		String[][] optionalParams = {
				{ "aue", aue },
				{ "voice_name", voiceName },
				{ "speed", speed },
				{ "volume", volume },
				{ "pitch", pitch },
				{ "engine_type", engineType },
				{ "text_type", textType } };

		StringBuilder json = new StringBuilder("{\"auf\":\"").append(auf).append("\"");
		for (String[] entry : optionalParams)
		{
			if (StringUtil.isNullOrEmpty(entry[1]))
			{
				continue;
			}
			json.append(",\"").append(entry[0]).append("\":\"").append(entry[1]).append("\"");
		}
		json.append("}");

		byte[] encodedParam = Base64.encodeBase64(json.toString().getBytes("UTF-8"));
		String paramBase64 = new String(encodedParam);
		// Request token: digest over API key, time stamp and encoded parameters
		String checkSum = DigestUtils.md5Hex(APPKEY_TTS + timestamp + paramBase64);

		Map<String, String> header = new HashMap<String, String>();
		header.put("Content-Type", "application/x-www-form-urlencoded; charset=utf-8");
		header.put("X-Param", paramBase64);
		header.put("X-CurTime", timestamp);
		header.put("X-CheckSum", checkSum);
		header.put("X-Real-Ip", IP);
		header.put("X-Appid", APPID);
		return header;
	}

  

關於語音合成和識別