Android AudioRecord錄音 並websocket實時傳輸,AudioTrack 播放wav 音訊,Speex加密
在一家專注於AI音訊公司做了一年,最近正處於預離職狀態,正好剛剛給客戶寫了個關於android音訊方面的demo,花了我足足一天趕出來的,感覺挺全面的決定再努力一點寫個總結。 公司雖小,是和中科院聲學所合作,也和訊飛一樣也有自己關於音訊的一系列語音識別/語音轉寫等引擎,麻雀雖小五臟俱全的感覺。 Android 音訊這塊其實也沒那麼神祕,神祕的地方有專門的C++/演算法工程師等為我們負責,大家都懂得,我只是搬搬磚。
主要涉及3點
- SpeechToText(音訊轉文字:STT): AudioRecord 錄製音訊,並用本地檔案和 Socket 兩種方式上傳。
- TextToSpeech (文字轉語音:TTS) API獲取音訊流並用AudioTrack 播放。
- Speex 加密
這裡不講 TTS/STT 底層原理,怎麼實現的我呆了這麼久也只懂一點點而已,涉及人耳聽聲相關函式/聲波/傅立葉分析等一系列複雜函式,這裡不敢班門弄斧了,感興趣請大家自行 Google。
AudioRecord 介紹
AudioRecord 過程是一個IPC過程,Java層通過JNI呼叫到native層的AudioRecord,後者通過IAudioRecord介面跨程序呼叫到 AudioFlinger,AudioFlinger負責啟動錄音執行緒,將從錄音資料來源裡採集的音訊資料填充到共享記憶體緩衝區,然後應用程式側從其裡面拷貝資料到自己的緩衝區。
AudioRecord 建構函式:

    public AudioRecord(
        int audioSource,       // 指定聲音源:MediaRecorder.AudioSource.MIC
        int sampleRateInHz,    // 指定取樣率,這裡 8000
        int channelConfig,     // 指定聲道數,單聲道
        int audioFormat,       // 指定 8/16bit PCM,這裡 16bit,模擬訊號轉化為數字訊號時的量化單位
        int bufferSizeInBytes) // 緩衝區大小,根據取樣率、通道、量化引數決定
1. STT 之本地錄完之後檔案形式上傳
第二步再與socket 上傳比較 //引數初始化 // 音訊輸入-麥克風
// Recording parameters (shared as constants) and recorder state.
public final static int AUDIO_INPUT = MediaRecorder.AudioSource.MIC; // audio source: microphone
public final static int AUDIO_SAMPLE_RATE = 8000; // sample rate in Hz — NOTE(review): original comment said "44.1KHz, commonly used", but the value is 8 kHz (telephony quality)
public final static int CHANNEL_CONFIG = AudioFormat.CHANNEL_IN_MONO; // single channel
public final static int AUDIO_FORMAT = AudioFormat.ENCODING_PCM_16BIT; // 16-bit PCM quantization
private int bufferSizeInBytes = 0; // recording buffer size in bytes, derived from the parameters above
private AudioRecord audioRecord;
private volatile boolean isRecord = false; // volatile: recording flag is written by the UI thread and read by the recording thread
//建立AudioRecord
private void creatAudioRecord() { // 獲得緩衝區位元組大小 bufferSizeInBytes = AudioRecord.getMinBufferSize(AudioFileUtils.AUDIO_SAMPLE_RATE, AudioFileUtils.CHANNEL_CONFIG, AudioFileUtils.AUDIO_FORMAT); // MONO單聲道 audioRecord = new AudioRecord(AudioFileUtils.AUDIO_INPUT, AudioFileUtils.AUDIO_SAMPLE_RATE, AudioFileUtils.CHANNEL_CONFIG, AudioFileUtils.AUDIO_FORMAT, bufferSizeInBytes); } // @Override public boolean onTouch(View v, MotionEvent event) { AudioRecordUtils utils = AudioRecordUtils.getInstance(); switch (event.getAction()) { case MotionEvent.ACTION_DOWN: utils.startRecordAndFile(); break; case MotionEvent.ACTION_UP: utils.stopRecordAndFile(); Log.d(TAG, "stopRecordAndFile"); stt(); break; } return false; } //開始錄音 public int startRecordAndFile() { Log.d("NLPService", "startRecordAndFile"); // 判斷是否有外部儲存裝置sdcard if (AudioFileUtils.isSdcardExit()) { if (isRecord) { return ErrorCode.E_STATE_RECODING; } else { if (audioRecord == null) { creatAudioRecord(); } audioRecord.startRecording(); // 讓錄製狀態為true isRecord = true; // 開啟音訊檔案寫入執行緒 new Thread(new AudioRecordThread()).start(); return ErrorCode.SUCCESS; } } else { return ErrorCode.E_NOSDCARD; } } //錄音執行緒 class AudioRecordThread implements Runnable { @Override public void run() { writeDateTOFile();// 往檔案中寫入裸資料 AudioFileUtils.raw2Wav(mAudioRaw, mAudioWav, bufferSizeInBytes);// 給裸資料加上標頭檔案 } } // 往檔案中寫入裸資料 private void writeDateTOFile() { Log.d("NLPService", "writeDateTOFile"); // new一個byte陣列用來存一些位元組資料,大小為緩衝區大小 byte[] audiodata = new byte[bufferSizeInBytes]; FileOutputStream fos = null; int readsize = 0; try { File file = new File(mAudioRaw); if (file.exists()) { file.delete(); } fos = new FileOutputStream(file);// 建立一個可存取位元組的檔案 } catch (Exception e) { e.printStackTrace(); } while (isRecord) { readsize = audioRecord.read(audiodata, 0, bufferSizeInBytes); if (AudioRecord.ERROR_INVALID_OPERATION != readsize && fos != null) { try { fos.write(audiodata); } catch (IOException e) { e.printStackTrace(); } } } try { if (fos != null) 
fos.close();// 關閉寫入流 } catch (IOException e) { e.printStackTrace(); } } //add wav header public static void raw2Wav(String inFilename, String outFilename, int bufferSizeInBytes) { Log.d("NLPService", "raw2Wav"); FileInputStream in = null; RandomAccessFile out = null; byte[] data = new byte[bufferSizeInBytes]; try { in = new FileInputStream(inFilename); out = new RandomAccessFile(outFilename, "rw"); fixWavHeader(out, AUDIO_SAMPLE_RATE, 1, AudioFormat.ENCODING_PCM_16BIT); while (in.read(data) != -1) { out.write(data); } in.close(); out.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } private static void fixWavHeader(RandomAccessFile file, int rate, int channels, int format) { try { int blockAlign; if (format == AudioFormat.ENCODING_PCM_16BIT) blockAlign = channels * 2; else blockAlign = channels; int bitsPerSample; if (format == AudioFormat.ENCODING_PCM_16BIT) bitsPerSample = 16; else bitsPerSample = 8; long dataLen = file.length() - 44; // hard coding byte[] header = new byte[44]; header[0] = 'R'; // RIFF/WAVE header header[1] = 'I'; header[2] = 'F'; header[3] = 'F'; header[4] = (byte) ((dataLen + 36) & 0xff); header[5] = (byte) (((dataLen + 36) >> 8) & 0xff); header[6] = (byte) (((dataLen + 36) >> 16) & 0xff); header[7] = (byte) (((dataLen + 36) >> 24) & 0xff); header[8] = 'W'; header[9] = 'A'; header[10] = 'V'; header[11] = 'E'; header[12] = 'f'; // 'fmt ' chunk header[13] = 'm'; header[14] = 't'; header[15] = ' '; header[16] = 16; // 4 bytes: size of 'fmt ' chunk header[17] = 0; header[18] = 0; header[19] = 0; header[20] = 1; // format = 1 header[21] = 0; header[22] = (byte) channels; header[23] = 0; header[24] = (byte) (rate & 0xff); header[25] = (byte) ((rate >> 8) & 0xff); header[26] = (byte) ((rate >> 16) & 0xff); header[27] = (byte) ((rate >> 24) & 0xff); header[28] = (byte) ((rate * blockAlign) & 0xff); header[29] = (byte) (((rate * blockAlign) >> 8) & 0xff); header[30] = (byte) (((rate * 
blockAlign) >> 16) & 0xff); header[31] = (byte) (((rate * blockAlign) >> 24) & 0xff); header[32] = (byte) (blockAlign); // block align header[33] = 0; header[34] = (byte) bitsPerSample; // bits per sample header[35] = 0; header[36] = 'd'; header[37] = 'a'; header[38] = 't'; header[39] = 'a'; header[40] = (byte) (dataLen & 0xff); header[41] = (byte) ((dataLen >> 8) & 0xff); header[42] = (byte) ((dataLen >> 16) & 0xff); header[43] = (byte) ((dataLen >> 24) & 0xff); file.seek(0); file.write(header, 0, 44); } catch (Exception e) { } finally { } } //檔案上傳結果回撥 public void stt() { File voiceFile = new File(AudioFileUtils.getWavFilePath()); if (!voiceFile.exists()) { return; } RequestBody requestBody = RequestBody.create(MediaType.parse("multipart/form-data"), voiceFile); MultipartBody.Part file = MultipartBody.Part.createFormData("file", voiceFile.getName(), requestBody); NetRequest.sAPIClient.stt(RequestBodyUtil.getParams(), file) .observeOn(AndroidSchedulers.mainThread()) .subscribe(new Action1<STT>() { @Override public void call(STT result) { if (result != null && result.getCount() > 0) { sttTv.setText("結果: " + result.getSegments().get(0).getContent()); } } }); } //記得關閉AudioRecord private void stopRecordAndFile() { if (audioRecord != null) { isRecord = false;// 停止檔案寫入 audioRecord.stop(); audioRecord.release();// 釋放資源 audioRecord = null; } } 複製程式碼
2. STT 之AudioRecord錄製websocket 線上傳輸
WebSocket介紹: 我只記住一點點:它是應用層協議 ,就像http 也是,不過它是一種全雙工通訊, socket 只是TCP/IP 的封裝,不算協議。websocket 第一次需要以http 介面建立長連線,就這麼點了。
//MyWebSocketListenerWebsocket 回撥
class MyWebSocketListener extends WebSocketListener { @Override public void onOpen(WebSocket webSocket, Response response) { output("onOpen: " + "webSocket connect success"); STTWebSocketActivity.this.webSocket = webSocket; startRecordAndFile(); //看清楚了開始錄音函式在這裡,原因由於涉及回撥,當分離時候 處理邏輯複雜 //,而且第二次錄音時候由於服務端WebSocket已經關閉 ,錄音資料不能正常傳輸,需要重新建立連線 } @Override public void onMessage(WebSocket webSocket, final String text) { runOnUiThread(new Runnable() { @Override public void run() { sttTv.setText("Stt result:" + text); } }); output("onMessage1: " + text); } @Override public void onMessage(WebSocket webSocket, ByteString bytes) { output("onMessage2 byteString: " + bytes); } @Override public void onClosing(WebSocket webSocket, int code, String reason) { output("onClosing: " + code + "/" + reason); } @Override public void onClosed(WebSocket webSocket, int code, String reason) { output("onClosed: " + code + "/" + reason); } @Override public void onFailure(WebSocket webSocket, Throwable t, Response response) { output("onFailure: " + t.getMessage()); } private void output(String s) { Log.d("NLPService", s); } } 補充:AudioRecord建立與前面相同 // okhttp 建立websocket 並設定監聽 private void createWebSocket() { Request request = new Request.Builder().url(sttApi).build(); NetRequest.getOkHttpClient().newWebSocket(request, socketListener); } class AudioRecordThread implements Runnable { @Override public void run() { //byteBuffer 緩衝區 (記憶體地址以陣列形式排列,一個基本資料型別的陣列) ByteBuffer audioBuffer = ByteBuffer.allocateDirect(bufferSizeInBytes).order(ByteOrder.LITTLE_ENDIAN);//小端模式 int readSize = 0; Log.d(TAG, "isRecord=" + isRecord); while (isRecord) { readSize = audioRecord.read(audioBuffer, audioBuffer.capacity()); if (readSize == AudioRecord.ERROR_INVALID_OPERATION || readSize == AudioRecord.ERROR_BAD_VALUE) { Log.d("NLPService", "Could not read audio data."); break; } boolean send = webSocket.send(ByteString.of(audioBuffer));//就這麼簡單哈哈 Log.d("NLPService", "send=" + send); audioBuffer.clear();//記住清空 } 
webSocket.send("close");//錄製完之後傳送約定欄位。通知服務端關閉。 } } 複製程式碼
......然後呢,然後就有資料了 ,就是這麼簡單
......然後老司機就要說了。。。你這沒有加密啊,效率很低啊。在此陳述一點,這裡是轉寫引擎,每次就一句話 ,傳輸資料量本身不大,後端大神們說沒必要加密,然後我就照辦了...當然也可以一邊加密一邊傳輸
3.TTS 之AudioTrack 播放wav檔案
這裡就比較簡單了,okhttp 呼叫 API 傳遞 text 獲取 response,然後用 AudioTrack 播放。這裡是原始音訊流,用 MediaPlayer 播放就有點大材小用了(我沒試過),不過 MediaPlayer 播放也是 IPC 過程,底層最終也是呼叫 AudioTrack 進行播放的。直接上程式碼:
public boolean request() { OkHttpClient client = NetRequest.getOkHttpClient(); Request request = new Request.Builder().url(NetRequest.BASE_URL + "api/tts?text=今天是星期三").build(); client.newCall(request).enqueue(new Callback() { @Override public void onFailure(Call call, IOException e) { } @Override public void onResponse(Call call, Response response) throws IOException { play(response.body().bytes()); } }); return true; } public void play( byte[] data) { try { Log.d(TAG, "audioTrack start "); AudioTrack audioTrack = new AudioTrack(mOutput, mSamplingRate, AudioFormat.CHANNEL_OUT_MONO, AudioFormat.ENCODING_PCM_16BIT, data.length, AudioTrack.MODE_STATIC); audioTrack.write(data, 0, data.length); audioTrack.play(); while (audioTrack.getPlaybackHeadPosition() < (data.length / 2)) { Thread.yield();//播放延遲處理...... } audioTrack.stop(); audioTrack.release(); } catch (IllegalArgumentException e) { } catch (IllegalStateException e) { } } 複製程式碼
4.speex 加密
speex 是一個開源免費的音訊壓縮/加密庫,C++ 寫的。demo 裡面是編譯好的 so 檔案,我親自編譯了好久各種坑,最後沒成功,只能借用了。-_-||。下面有個 speexDemo 整個專案在工程裡,音訊加密解密都正常,親測可用。學習這塊時候從 CSDN 下來的,搬過來湊個數。
public static void raw2spx(String inFileName, String outFileName) { FileInputStream rawFileInputStream = null; FileOutputStream fileOutputStream = null; try { rawFileInputStream = new FileInputStream(inFileName); fileOutputStream = new FileOutputStream(outFileName); byte[] rawbyte = new byte[320]; byte[] encoded = new byte[160]; //將原資料轉換成spx壓縮的檔案,speex只能編碼160位元組的資料,需要使用一個迴圈 int readedtotal = 0; int size = 0; int encodedtotal = 0; while ((size = rawFileInputStream.read(rawbyte, 0, 320)) != -1) { readedtotal = readedtotal + size; short[] rawdata = ShortByteUtil.byteArray2ShortArray(rawbyte); int encodesize = SpeexUtil.getInstance().encode(rawdata, 0, encoded, rawdata.length); fileOutputStream.write(encoded, 0, encodesize); encodedtotal = encodedtotal + encodesize; } fileOutputStream.close(); rawFileInputStream.close(); } catch (Exception e) { } } 複製程式碼