要解决的问题:
一、访问后台,后台能正常返回失败后的文本。
1.E/FMQ: grantorIdx must be less than 3
2.{"error":"decode ws request failed: unable to decode V1 protocol message: autoAssignedSequence (-86) mismatch sequence in request (86)"} 
3.decode ws request failed: unable to decode V1 protocol message: unable to ungzip
参考豆包官方文档:最后一个包的序列号必须取负值(例如应为 -86),而之前传过去的是 86;另外最后一包传递的 payload 内容为 null,不是合法的 gzip 数据(推测这就是第 3 条 unable to ungzip 报错的原因)。

二、提高速度。如果先把录音存成文件,再把文件传给后台,语音转文本就会变慢。
解决办法:一边录音一边传递数据,让录音和上传并发执行。

三、后台返回的文字内容,如何断句,达到声音流畅。
根据句末标点(。!?)断句,每断出一句就播放一句。

package com.*

import android.Manifest;
import android.content.pm.PackageManager;
import android.media.AudioFormat;
import android.media.MediaCodec;
import android.media.MediaExtractor;
import android.media.MediaFormat;
import android.os.Handler;
import android.os.Looper;
import android.util.Log;

import com.google.gson.Gson;
import com.google.gson.JsonObject;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;


import android.media.AudioFormat;
import android.media.AudioRecord;
import android.media.MediaRecorder;

import androidx.core.app.ActivityCompat;

import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import okhttp3.WebSocket;
import okhttp3.WebSocketListener;
import okio.ByteString;

/**
 * 基于火山引擎大模型流式语音识别 API 的语音转文字实现
 */
public class VolcengineSpeechToTextConverter {
    private static final String TAG = "VolcengineSpeechToTextConverter";
    private static final String URL = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel";
    private static final String APP_ID = "*********";
    private static final String TOKEN = "*******************";
    private static final byte PROTOCOL_VERSION = 0b0001;
    private static final byte DEFAULT_HEADER_SIZE = 0b0001;
    private static final byte FULL_CLIENT_REQUEST = 0b0001;
    private static final byte AUDIO_ONLY_REQUEST = 0b0010;
    private static final byte FULL_SERVER_RESPONSE = 0b1001;
    private static final byte SERVER_ACK = 0b1011;
    private static final byte SERVER_ERROR_RESPONSE = 0b1111;
    private static final byte NO_SEQUENCE = 0b0000;
    private static final byte POS_SEQUENCE = 0b0001;
    private static final byte NEG_WITH_SEQUENCE = 0b0011;
    private static final byte NO_SERIALIZATION = 0b0000;
    private static final byte JSON = 0b0001;
    private static final byte NO_COMPRESSION = 0b0000;
    private static final byte GZIP = 0b0001;

    private final OkHttpClient client;
    private final ExecutorService executor;
    private final Handler mainHandler;
    private AudioRecord currentRecorder;
    private volatile boolean isRecording = false; // 控制录音状态
    private volatile boolean isWebSocketConnected = false; // 跟踪 WebSocket 连接状态

    /**
     * Creates a converter with its own OkHttp client (50 s ping interval, 100 s read and
     * write timeouts), a single-thread background executor and a handler bound to the
     * main looper.
     */
    public VolcengineSpeechToTextConverter() {
        OkHttpClient.Builder httpBuilder = new OkHttpClient.Builder();
        httpBuilder.pingInterval(50, TimeUnit.SECONDS);
        httpBuilder.readTimeout(100, TimeUnit.SECONDS);
        httpBuilder.writeTimeout(100, TimeUnit.SECONDS);
        this.client = httpBuilder.build();

        this.mainHandler = new Handler(Looper.getMainLooper());
        this.executor = Executors.newSingleThreadExecutor();
    }

    /**
     * Stops the current capture session: clears the recording and connection flags, then
     * stops and releases the active {@link AudioRecord}, if any.
     *
     * <p>This method is invoked both by the owner of the converter and by the background
     * capture task, so it is synchronized and detaches the recorder from the field before
     * touching it; a recorder is therefore released at most once. Safe to call repeatedly.
     */
    public synchronized void stopRealTimeCapture() {
        isRecording = false;
        isWebSocketConnected = false;

        AudioRecord recorder = currentRecorder;
        currentRecorder = null;
        if (recorder == null) {
            return;
        }
        try {
            recorder.stop();
        } catch (Exception e) {
            Log.d(TAG, "停止录音失败: " + e.getMessage());
        } finally {
            // Release even when stop() failed, otherwise the native recorder leaks.
            try {
                recorder.release();
            } catch (Exception e) {
                Log.d(TAG, "停止录音失败: " + e.getMessage());
            }
        }
    }

    /**
     * Starts a microphone capture session (16 kHz, mono, 16-bit PCM) and opens the
     * WebSocket to the recognition service. The connection is asynchronous; audio is
     * streamed by the listener once the socket is open.
     *
     * <p>If the recorder cannot be created or started (for example because the record
     * permission is missing), the recorder is released, the error is reported through
     * {@code callback.onError} on the main thread, and no connection is opened.
     *
     * @param callback receives the final transcript or an error message
     */
    public void startRealTimeAudioCapture(SpeechToTextCallback callback) {
        final int sampleRate = 16000; // 16 kHz
        final int channelConfig = AudioFormat.CHANNEL_IN_MONO;
        final int audioFormat = AudioFormat.ENCODING_PCM_16BIT;

        // getMinBufferSize reports failures as non-positive error codes.
        int bufferSize = AudioRecord.getMinBufferSize(sampleRate, channelConfig, audioFormat);
        if (bufferSize <= 0) {
            notifyError(callback, "无法获取录音缓冲区大小: " + bufferSize);
            return;
        }

        AudioRecord recorder;
        try {
            recorder = new AudioRecord(
                    MediaRecorder.AudioSource.MIC,
                    sampleRate,
                    channelConfig,
                    audioFormat,
                    bufferSize
            );
        } catch (IllegalArgumentException | SecurityException e) {
            notifyError(callback, "录音器初始化失败: " + e.getMessage());
            return;
        }

        if (recorder.getState() != AudioRecord.STATE_INITIALIZED) {
            recorder.release();
            notifyError(callback, "录音器初始化失败,请检查录音权限");
            return;
        }

        try {
            recorder.startRecording();
        } catch (IllegalStateException e) {
            recorder.release();
            notifyError(callback, "启动录音失败: " + e.getMessage());
            return;
        }

        currentRecorder = recorder;
        isRecording = true;

        Request request = new Request.Builder()
                .url(URL)
                .header("X-Api-App-Key", APP_ID)
                .header("X-Api-Access-Key", TOKEN)
                .header("X-Api-Resource-Id", "volc.bigasr.sauc.duration")
                .header("X-Api-Connect-Id", UUID.randomUUID().toString())
                .build();

        // The listener owns the streaming loop; it starts in onOpen.
        PcmWebSocketListener listener = new PcmWebSocketListener(callback, recorder);
        client.newWebSocket(request, listener);
    }



    /**
     * Delivers an error message to the callback by posting to the main-looper handler.
     *
     * @param callback     receiver of the error
     * @param errorMessage text passed to {@code onError}
     */
    private void notifyError(SpeechToTextCallback callback, String errorMessage) {
        Runnable deliverError = () -> callback.onError(errorMessage);
        mainHandler.post(deliverError);
    }

    /**
     * Delivers a transcript to the callback by posting to the main-looper handler.
     *
     * @param callback   receiver of the result
     * @param transcript text passed to {@code onSuccess}
     */
    private void notifySuccess(SpeechToTextCallback callback, String transcript) {
        Runnable deliverResult = () -> callback.onSuccess(transcript);
        mainHandler.post(deliverResult);
    }

    private class PcmWebSocketListener extends WebSocketListener {
        private final SpeechToTextCallback callback;
        private WebSocket webSocket;
        private final AudioRecord recorder;
        private int seq = 1;
        private final List<String> transcripts = new ArrayList<>();

        /**
         * Creates a listener that streams audio from the given recorder.
         *
         * @param callback receiver of the transcript or of error messages
         * @param recorder recorder the capture task reads PCM data from
         */
        public PcmWebSocketListener(SpeechToTextCallback callback, AudioRecord recorder) {
            this.recorder = recorder;
            this.callback = callback;
        }

        /**
         * Builds the JSON body of the initial request: a {@code user} section, an
         * {@code audio} section describing raw PCM, and a {@code request} section selecting
         * the model with punctuation enabled.
         *
         * @param sampleRate value written to {@code audio.sample_rate}
         * @param channels   value written to {@code audio.channel}
         * @param bits       value written to {@code audio.bits}
         * @return the assembled payload object
         */
        private JsonObject createInitPayload(int sampleRate, int channels, int bits) {
            JsonObject requestSection = new JsonObject();
            requestSection.addProperty("model_name", "bigmodel");
            requestSection.addProperty("enable_punc", true);

            JsonObject audioSection = new JsonObject();
            audioSection.addProperty("format", "pcm");
            audioSection.addProperty("sample_rate", sampleRate);
            audioSection.addProperty("bits", bits);
            audioSection.addProperty("channel", channels);
            audioSection.addProperty("codec", "raw");

            JsonObject userSection = new JsonObject();
            userSection.addProperty("uid", "test");

            // Sections are attached in the same order as before so the serialized JSON is unchanged.
            JsonObject root = new JsonObject();
            root.add("user", userSection);
            root.add("audio", audioSection);
            root.add("request", requestSection);
            return root;
        }

        /**
         * Called when the WebSocket is open. Sends the initial configuration request with
         * sequence 1 and then queues the capture task, which streams microphone audio
         * starting at sequence 2.
         *
         * <p>If the initial request failed, {@code sendInitialRequest} has already reported
         * the error and closed the socket, so the capture task is not queued and the
         * recorder is released right away.
         */
        @Override
        public void onOpen(WebSocket webSocket, Response response) {
            this.webSocket = webSocket;
            isWebSocketConnected = true;
            Log.d(TAG, "===> WebSocket连接已建立");

            sendInitialRequest(webSocket, createInitPayload(16000, 1, 16));
            if (this.webSocket == null) {
                stopRealTimeCapture();
                return;
            }
            seq = 2; // the initial request used sequence 1
            Log.d(TAG, "onOpen set seq" + seq);

            executor.execute(this::streamMicrophoneAudio);
        }

        /**
         * Reads PCM data from the recorder in 3200-byte chunks (100 ms at 16 kHz/16-bit
         * mono) and sends each chunk, then sends the closing packet and stops capture.
         */
        private void streamMicrophoneAudio() {
            try {
                recorder.startRecording();
                // The connection flag was set in onOpen; if it is already cleared the
                // session was torn down before this task ran, so waiting cannot help.
                if (!isWebSocketConnected) {
                    notifyError(callback, "WebSocket 连接超时");
                    return;
                }

                byte[] buffer = new byte[3200];
                while (isRecording && recorder.getRecordingState() == AudioRecord.RECORDSTATE_RECORDING) {
                    int bytesRead = recorder.read(buffer, 0, buffer.length);
                    if (bytesRead > 0) {
                        sendPcmChunk(buffer, bytesRead, false);
                    } else if (bytesRead < 0) {
                        Log.w(TAG, "AudioRecord.read failed: " + bytesRead);
                        break;
                    }
                }

                // sendPcmChunk gzips whatever it is given, so the closing packet must carry
                // real PCM. One silent 16-bit sample keeps the payload a valid gzip stream
                // without injecting non-audio bytes into the recognizer.
                byte[] silentSample = new byte[2];
                sendPcmChunk(silentSample, silentSample.length, true);
            } catch (Exception e) {
                notifyError(callback, "录音或数据发送失败: " + e.getMessage());
            } finally {
                stopRealTimeCapture();
            }
        }

            /**
             * Frames one chunk of PCM audio and sends it over the WebSocket.
             *
             * <p>Frame layout: 4-byte header, 4-byte big-endian sequence number, 4-byte
             * big-endian payload size, gzip-compressed payload. The last packet carries the
             * negated sequence number and the {@code NEG_WITH_SEQUENCE} flag.
             *
             * @param chunk  buffer holding the audio bytes
             * @param length number of valid bytes in {@code chunk}
             * @param isLast whether this is the closing packet of the stream
             */
            public void sendPcmChunk(byte[] chunk, int length, boolean isLast) {
            // Work on a snapshot: cleanup() may null the field from another thread
            // between the null check and the send.
            final WebSocket socket = webSocket;
            if (socket == null) {
                Log.d(TAG, "===> webSocket == null");
                notifyError(callback, "WebSocket 未连接");
                stopRealTimeCapture();
                return;
            }
            try {
                byte messageTypeSpecificFlags = isLast ? NEG_WITH_SEQUENCE : POS_SEQUENCE;
                byte[] header = getHeader(AUDIO_ONLY_REQUEST, messageTypeSpecificFlags, JSON, GZIP, (byte) 0);
                int wireSequence = isLast ? -seq : seq;
                byte[] compressedChunk = gzipCompress(chunk, length);

                ByteBuffer message = ByteBuffer.allocate(header.length + 4 + 4 + compressedChunk.length);
                message.put(header);
                message.putInt(wireSequence);            // big-endian by default
                message.putInt(compressedChunk.length);
                message.put(compressedChunk);

                // Only advance the counter for non-final packets.
                if (!isLast) {
                    seq++;
                }
                // send() returns false when the message was not enqueued; treat it as a failure.
                if (!socket.send(ByteString.of(message.array()))) {
                    throw new IOException("发送音频数据失败");
                }
            } catch (Exception e) {
                notifyError(callback, "数据传输失败: " + e.getMessage());
                cleanup();
            }
        }

        /**
         * Sends the first frame of the session: header, sequence number 1, payload size and
         * the gzip-compressed JSON payload. On any failure the error is reported to the
         * callback and the socket is cleaned up.
         *
         * @param webSocket socket to send on
         * @param payload   JSON configuration to transmit
         */
        private void sendInitialRequest(WebSocket webSocket, JsonObject payload) {
            try {
                byte[] body = gzipCompress(payload.toString().getBytes());
                byte[] header = getHeader(FULL_CLIENT_REQUEST, POS_SEQUENCE, JSON, GZIP, (byte) 0);
                byte[] sizeField = intToBytes(body.length);
                byte[] sequenceField = generateBeforPayload(1);

                ByteBuffer frame = ByteBuffer.allocate(
                        header.length + sequenceField.length + sizeField.length + body.length);
                frame.put(header);
                frame.put(sequenceField);
                frame.put(sizeField);
                frame.put(body);

                if (!webSocket.send(ByteString.of(frame.array()))) {
                    throw new IOException("发送初始请求失败");
                }
            } catch (Exception e) {
                System.out.println("===> 初始请求发送异常: " + e.getMessage());
                notifyError(callback, "初始化失败: " + e.getMessage());
                cleanup();
            }
        }

        /** Prints any text frame received from the server; text frames are not otherwise processed. */
        @Override
        public void onMessage(WebSocket webSocket, String text) {
            String logLine = "===> 收到文本消息: " + text;
            System.out.println(logLine);
        }

        /** Forwards a binary frame to {@code handleBinaryMessage} as a byte array. */
        @Override
        public void onMessage(WebSocket webSocket, ByteString bytes) {
            byte[] frame = bytes.toByteArray();
            handleBinaryMessage(frame);
        }

        /**
         * Marks the connection as down, prints the failure, reports it to the callback and
         * cleans up the socket.
         */
        @Override
        public void onFailure(WebSocket webSocket, Throwable t, Response response) {
            isWebSocketConnected = false;
            String reason = t.getMessage();
            System.out.println("===> WebSocket失败: " + reason);
            notifyError(callback, "连接失败: " + reason);
            cleanup();
        }

        /**
         * Marks the connection as down, prints the close code and reason, closes the socket
         * with code 1000 and cleans up.
         */
        @Override
        public void onClosing(WebSocket webSocket, int code, String reason) {
            isWebSocketConnected = false;
            String closingLog = "===> 连接关闭中: " + code + " - " + reason;
            System.out.println(closingLog);
            webSocket.close(1000, null);
            cleanup();
        }

        /** Marks the connection as down, prints the close code and reason, and cleans up. */
        @Override
        public void onClosed(WebSocket webSocket, int code, String reason) {
            isWebSocketConnected = false;
            String closedLog = "===> 连接已关闭: " + code + " - " + reason;
            System.out.println(closedLog);
            cleanup();
        }

        /**
         * Parses a binary server frame. Frames with a non-negative sequence are ignored;
         * a negative sequence marks the final result, which is delivered to the callback
         * before the socket is closed.
         *
         * <p>A frame that cannot be parsed is reported to the callback as a parse error
         * instead of letting the exception escape into the WebSocket reader.
         *
         * @param res raw bytes of the received frame
         */
        private void handleBinaryMessage(byte[] res) {
            final ParserResponseResult responseResult;
            try {
                responseResult = parserResponse(res);
            } catch (RuntimeException e) {
                Log.e(TAG, "===> 响应解析失败", e);
                notifyError(callback, "响应解析失败: " + e.getMessage());
                cleanup();
                return;
            }

            boolean isLastPackage = responseResult.sequence < 0;
            if (!isLastPackage) {
                return;
            }

            String text = responseResult.text;
            if (text != null) {
                transcripts.clear();
                transcripts.add(text);
            }
            System.out.println("===> 识别完成");
            Log.d(TAG, "===> 识别完成 transcripts 结果: " + transcripts.toString());
            notifySuccess(callback, String.join(" ", transcripts));

            // Another callback may already have cleared the field; use a snapshot to avoid an NPE.
            WebSocket socket = webSocket;
            if (socket != null) {
                socket.close(1000, "finished");
            }
            cleanup();
        }

        /**
         * Marks the connection as down and closes the socket with code 1000 if one is
         * still attached. Safe to call repeatedly.
         *
         * <p>The field is detached into a local before closing because this method is
         * reached from both the WebSocket callbacks and the capture task; reading the field
         * twice could observe {@code null} on the second read.
         */
        private void cleanup() {
            isWebSocketConnected = false;
            WebSocket socket = webSocket;
            webSocket = null;
            if (socket == null) {
                return;
            }
            try {
                socket.close(1000, "normal closure");
            } catch (Exception e) {
                System.out.println("===> WebSocket关闭异常: " + e.getMessage());
            }
        }
        // 其他方法(如 onMessage、onFailure 等)保持不变
    }

    /**
     * Renders bytes as upper-case, two-digit hexadecimal values, each followed by a space
     * (for example {@code "1F 8B "}).
     *
     * @param bytes bytes to render
     * @return the hexadecimal dump
     */
    private String bytesToHexString(byte[] bytes) {
        final char[] hexDigits = "0123456789ABCDEF".toCharArray();
        StringBuilder dump = new StringBuilder(bytes.length * 3);
        for (int i = 0; i < bytes.length; i++) {
            int unsigned = bytes[i] & 0xFF;
            dump.append(hexDigits[unsigned >>> 4]);
            dump.append(hexDigits[unsigned & 0x0F]);
            dump.append(' ');
        }
        return dump.toString();
    }

    /**
     * Outcome of parsing one server frame: the sequence value and the recognized text,
     * which may be {@code null}.
     */
    class ParserResponseResult {
        /** Sequence value attached to the frame. */
        int sequence;

        /** Recognized text, or {@code null} when none was extracted. */
        String text;

        /**
         * @param sequence sequence value of the frame
         * @param text     recognized text, may be {@code null}
         */
        public ParserResponseResult(int sequence, String text) {
            this.text = text;
            this.sequence = sequence;
        }
    }
    /**
     * Builds the 4-byte frame header. Each of the first three bytes packs two 4-bit
     * fields, high nibble first.
     *
     * @param messageType              message type (high nibble of byte 1)
     * @param messageTypeSpecificFlags flags (low nibble of byte 1)
     * @param serialMethod             serialization method (high nibble of byte 2)
     * @param compressionType          compression type (low nibble of byte 2)
     * @param reservedData             value stored unchanged in byte 3
     * @return the header bytes
     */
    static byte[] getHeader(byte messageType, byte messageTypeSpecificFlags, byte serialMethod, byte compressionType, byte reservedData) {
        byte versionAndHeaderSize = (byte) ((PROTOCOL_VERSION << 4) | DEFAULT_HEADER_SIZE);
        byte typeAndFlags = (byte) ((messageType << 4) | messageTypeSpecificFlags);
        byte serializationAndCompression = (byte) ((serialMethod << 4) | compressionType);
        return new byte[]{versionAndHeaderSize, typeAndFlags, serializationAndCompression, reservedData};
    }

    /**
     * Encodes an int as 4 bytes, most significant byte first.
     *
     * @param a value to encode
     * @return a new 4-byte big-endian array
     */
    static byte[] intToBytes(int a) {
        // ByteBuffer is big-endian unless configured otherwise.
        return ByteBuffer.allocate(4).putInt(a).array();
    }

    /**
     * Decodes a 4-byte big-endian array into an int.
     *
     * @param src exactly 4 bytes, most significant first
     * @return the decoded value
     * @throws IllegalArgumentException if {@code src} is null or not 4 bytes long
     */
    static int bytesToInt(byte[] src) {
        boolean valid = src != null && src.length == 4;
        if (!valid) {
            throw new IllegalArgumentException("无效的字节数组");
        }
        // ByteBuffer is big-endian unless configured otherwise.
        return ByteBuffer.wrap(src).getInt();
    }

    /**
     * Produces the bytes placed between the header and the payload size: the sequence
     * number as encoded by {@code intToBytes}.
     *
     * @param seq sequence number of the frame
     * @return the encoded sequence field
     */
    static byte[] generateBeforPayload(int seq) {
        byte[] sequenceField = intToBytes(seq);
        return sequenceField;
    }

    /**
     * Gzip-compresses a whole array.
     *
     * @param src bytes to compress; {@code null} is treated as empty
     * @return a complete gzip stream
     */
    static byte[] gzipCompress(byte[] src) {
        return gzipCompress(src, src == null ? 0 : src.length);
    }

    /**
     * Gzip-compresses the first {@code len} bytes of {@code src}.
     *
     * <p>Empty input produces a valid gzip stream of zero bytes rather than a zero-length
     * array, because a zero-length array is not something a gzip decoder can read.
     *
     * @param src bytes to compress; {@code null} is treated as empty
     * @param len number of leading bytes of {@code src} to compress
     * @return a complete gzip stream
     * @throws IllegalStateException if the in-memory compression fails
     */
    static byte[] gzipCompress(byte[] src, int len) {
        int count = (src == null) ? 0 : len;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try (GZIPOutputStream gzip = new GZIPOutputStream(out)) {
            if (count > 0) {
                gzip.write(src, 0, count);
            }
        } catch (IOException e) {
            // Not expected for an in-memory stream; surface it instead of returning a truncated stream.
            throw new IllegalStateException("gzip 压缩失败", e);
        }
        return out.toByteArray();
    }

    /**
     * Decompresses a gzip stream.
     *
     * <p>Decoding is best-effort: if the stream is corrupt, the stack trace is printed and
     * whatever was decoded up to that point is returned.
     *
     * @param src gzip data; {@code null} or empty yields an empty result
     * @return the decompressed bytes, never {@code null}
     */
    static byte[] gzipDecompress(byte[] src) {
        if (src == null || src.length == 0) {
            // Return an empty array so callers can build a String without a null check.
            return new byte[0];
        }
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try (GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(src))) {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = gzip.read(buffer)) > 0) {
                out.write(buffer, 0, len);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return out.toByteArray();
    }

    /**
     * Parses one binary server frame.
     *
     * <p>Layout read by this method: 4-byte header, 4-byte big-endian integer (the
     * sequence, or the error code for error responses), 4-byte big-endian payload size,
     * then the payload, which is gunzipped when the header's compression nibble says so.
     *
     * <p>When a full server response carries the "negative sequence" flag and its JSON
     * contains a {@code result} object, the returned sequence is forced to -1 and the
     * text is taken from {@code result.text} if present.
     *
     * <p>NOTE(review): for error responses the returned sequence is the server's error
     * code and the message is only printed; callers cannot tell an error frame from a
     * normal one through the return value.
     *
     * @param res raw frame bytes
     * @return the sequence and text; {@code (-1, null)} for a null or empty frame
     * @throws IllegalArgumentException if the frame is shorter than the 12 bytes needed
     *                                  for the header, sequence and payload size
     */
    ParserResponseResult parserResponse(byte[] res) {
        if (res == null || res.length == 0) {
            return new ParserResponseResult(-1, null);
        }
        if (res.length < 12) {
            throw new IllegalArgumentException("响应数据长度不足: " + res.length);
        }

        // Mask after shifting: bytes are signed, so a plain >> would sign-extend.
        int protocolVersion = (res[0] >> 4) & 0x0f;
        int headerSize = res[0] & 0x0f;
        int messageType = (res[1] >> 4) & 0x0f;
        int messageTypeSpecificFlags = res[1] & 0x0f;
        int serializationMethod = (res[2] >> 4) & 0x0f;
        int messageCompression = res[2] & 0x0f;
        int reserved = res[3];

        int sequence = ByteBuffer.wrap(res, 4, 4).getInt();
        byte[] payload = Arrays.copyOfRange(res, 12, res.length);

        String payloadStr = null;
        if (messageType == FULL_SERVER_RESPONSE || messageType == SERVER_ACK) {
            byte[] body = messageCompression == GZIP ? gzipDecompress(payload) : payload;
            if (body != null) {
                payloadStr = new String(body, java.nio.charset.StandardCharsets.UTF_8);
            }
        } else if (messageType == SERVER_ERROR_RESPONSE) {
            payloadStr = new String(payload, java.nio.charset.StandardCharsets.UTF_8);
            Map<String, Object> result = new HashMap<>();
            result.put("protocol_version", protocolVersion);
            result.put("header_size", headerSize);
            result.put("message_type", messageType);
            result.put("message_type_specific_flags", messageTypeSpecificFlags);
            result.put("serialization_method", serializationMethod);
            result.put("message_compression", messageCompression);
            result.put("reserved", reserved);
            result.put("code", sequence);
            result.put("error_msg", payloadStr);
            System.out.println("===>response:" + new Gson().toJson(result));
        }

        String text = null;
        if (messageTypeSpecificFlags == NEG_WITH_SEQUENCE && messageType == FULL_SERVER_RESPONSE) {
            // fromJson returns null for a null or empty document.
            JsonObject jsonObject = new Gson().fromJson(payloadStr, JsonObject.class);
            JsonObject resultObj = jsonObject == null ? null : jsonObject.getAsJsonObject("result");
            if (resultObj != null) {
                if (resultObj.has("text") && !resultObj.get("text").isJsonNull()) {
                    text = resultObj.get("text").getAsString();
                }
                System.out.println("===>response:text" + text);
                sequence = -1;
            }
        }
        return new ParserResponseResult(sequence, text);
    }
}

Logo

欢迎加入 MCP 技术社区!与志同道合者携手前行,一同解锁 MCP 技术的无限可能!

更多推荐