stlukey · mico-boje · Mar 6, 2023 · Mar 6, 2023 · Mar 6, 2023 · Mar 6, 2023
diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
@@ -3,10 +3,13 @@ run-name: ${{ github.actor }} is building wheels
 on: [push]
 jobs:
   build_wheels:
-    runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
+        os: [ubuntu-latest, windows-latest]
+        # os: [ubuntu-latest, macos-latest, windows-latest]
+        python-version: ["3.10"]
+        #xcode: [13.2]
+    runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v2
       - name: Checkout submodules
@@ -30,6 +33,15 @@ jobs:
                fi
         shell: bash
 
+      - name: Install MSBuild
+        if: runner.os == 'windows'
+        uses: microsoft/setup-msbuild@v1.1
+
+      - name: Set up C++ environment
+        if: runner.os == 'windows'
+        run: |
+              "C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvarsall.bat" x86
+
       - name: Build wheel
         run: python -m cibuildwheel --output-dir dist/
         env:

diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,10 @@ __pycache__/
 # C extensions
 *.so
 
+# Project specific
+/local
+whispercpp.cpp
+
 # Distribution / packaging
 .Python
 build/

diff --git a/README.md b/README.md
@@ -1,8 +1,22 @@
 Python bindings for whisper.cpp
 ===============================
+This project provides Python bindings for the whisper.cpp library, which is a C++ implementation of a speech-to-text engine. The bindings are implemented in Cython, a language that allows easy integration between Python and C/C++ code.
 
+# Installation
 `pip install git+https://github.com/o4dev/whispercpp.py`
+# Examples
+Once the package is installed, you can use it to transcribe speech files. Here's an example:
+```python
+from whispercpp import Whisper
+
+w = Whisper('tiny')
+
+result = w.transcribe("myfile.mp3", language='en')
+text = w.extract_text(result)
+```
+This code creates a Whisper object using the 'tiny' model and transcribes the audio file "myfile.mp3" as English. The value returned by transcribe is an integer result code, not the text itself; pass it to the extract_text method to obtain the transcribed text.
 
+If you don't specify a language, Whisper will try to determine the language of the audio:
 ```python
 from whispercpp import Whisper
 
@@ -14,3 +28,25 @@ text = w.extract_text(result)
 
 Note: default parameters might need to be tweaked.
 See Whispercpp.pyx.
+
+# Available Models
+The following models are available for use with Whispercpp.py:
+| Model     | Disk    | Mem       | SHA                                                               |
+|-----------|---------|-----------|------------------------------------------------------------------|
+| tiny      | 75 MB   | ~390 MB   | bd577a113a864445d4c299885e0cb97d4ba92b5f                          |
+| tiny.en   | 75 MB   | ~390 MB   | c78c86eb1a8faa21b369bcd33207cc90d64ae9df                          |
+| base      | 142 MB  | ~500 MB   | 465707469ff3a37a2b9b8d8f89f2f99de7299dac                          |
+| base.en   | 142 MB  | ~500 MB   | 137c40403d78fd54d454da0f9bd998f78703390c                          |
+| small     | 466 MB  | ~1.0 GB   | 55356645c2b361a969dfd0ef2c5a50d530afd8d5                          |
+| small.en  | 466 MB  | ~1.0 GB   | db8a495a91d927739e50b3fc1cc4c6b8f6c2d022                          |
+| medium    | 1.5 GB  | ~2.6 GB   | fd9727b6e1217c2f614f9b698455c4ffd82463b4                          |
+| medium.en | 1.5 GB  | ~2.6 GB   | 8c30f0e44ce9560643ebd10bbe50cd20eafd3723                          |
+| large-v1  | 2.9 GB  | ~4.7 GB   | b1caaf735c4cc1429223d5a74f0f4d0b9b59a299                          |
+| large     | 2.9 GB  | ~4.7 GB   | 0f4c8e34f21cf1a914c59d8b3ce882345ad349d6                          |
+
+To use a specific model with Whispercpp.py, specify the model name when creating a Whisper object:
+```python
+from whispercpp import Whisper
+
+w = Whisper('base.en')
+```
diff --git a/whisper.cpp b/whisper.cpp
diff --git a/whispercpp.pxd b/whispercpp.pxd
@@ -71,7 +71,7 @@ cdef extern from "whisper.h" nogil:
         whisper_encoder_begin_callback encoder_begin_callback
         void* encoder_begin_callback_user_data
     whisper_full_params whisper_full_default_params(whisper_sampling_strategy)
-    cdef whisper_context* whisper_init(char*)
+    cdef whisper_context* whisper_init_from_file(char*)
     cdef void whisper_free(whisper_context*)
     cdef int whisper_pcm_to_mel(whisper_context*, float*, int, int)
     cdef int whisper_set_mel(whisper_context*, float*, int, int)

diff --git a/whispercpp.pyx b/whispercpp.pyx
@@ -7,34 +7,47 @@ import requests
 import os
 from pathlib import Path
 
-MODELS_DIR = str(Path('~/.ggml-models').expanduser())
+MODELS_DIR = str(Path('~/.cache/ggml-models').expanduser())
 print("Saving models to:", MODELS_DIR)
 
 
 cimport numpy as cnp
+from cpython.mem cimport PyMem_Malloc, PyMem_Free
 
 cdef int SAMPLE_RATE = 16000
 cdef char* TEST_FILE = 'test.wav'
 cdef char* DEFAULT_MODEL = 'tiny'
-cdef char* LANGUAGE = b'fr'
+cdef char* LANGUAGE = NULL
 cdef int N_THREADS = os.cpu_count()
 
 MODELS = {
     'ggml-tiny.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin',
+    'ggml-tiny.en.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin',
     'ggml-base.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-base.bin',
+    'ggml-base.en.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin',
     'ggml-small.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-small.bin',
+    'ggml-small.en.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-small.en.bin',
     'ggml-medium.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin',
+    'ggml-medium.en.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-medium.en.bin',
     'ggml-large.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-large.bin',
+    'ggml-large-v1.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-large-v1.bin',
 }
 
 def model_exists(model):
     return os.path.exists(Path(MODELS_DIR).joinpath(model))
 
+def sampling_strategy_from_string(strategy_string):
+    strategy_map = {  # strategy name (matched case-insensitively) -> whisper_sampling_strategy member
+        'GREEDY': whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY,
+        'BEAM_SEARCH': whisper_sampling_strategy.WHISPER_SAMPLING_BEAM_SEARCH
+    }
+    return strategy_map[strategy_string.upper()]  # raises KeyError for an unrecognised name
+
 def download_model(model):
     if model_exists(model):
         return
-
     print(f'Downloading {model}...')
+
     url = MODELS[model]
     r = requests.get(url, allow_redirects=True)
     os.makedirs(MODELS_DIR, exist_ok=True)
@@ -43,6 +56,7 @@ def download_model(model):
 
 
 cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] load_audio(bytes file, int sr = SAMPLE_RATE):
+    print("Sampling rate:", sr)
     try:
         out = (
             ffmpeg.input(file, threads=0)
@@ -57,7 +71,7 @@ cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] load_audio(bytes file, int sr
                 capture_stderr=True
             )
         )[0]
-    except:
+    except FileNotFoundError:
         raise RuntimeError(f"File '{file}' not found")
 
     cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames = (
@@ -68,9 +82,10 @@ cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] load_audio(bytes file, int sr
 
     return frames
 
-cdef whisper_full_params default_params() nogil:
+cdef whisper_full_params default_params(strategy='GREEDY'):
+    strategy_value = sampling_strategy_from_string(strategy)
     cdef whisper_full_params params = whisper_full_default_params(
-        whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY
+        strategy_value
     )
     params.print_realtime = True
     params.print_progress = True
@@ -81,27 +96,57 @@ cdef whisper_full_params default_params() nogil:
 
 
 cdef class Whisper:
+    """
+    This class provides an interface for speech recognition using the Whisper library.
+
+    Parameters:
+    -----------
+    model (str): Model to use for transcription. One of ['tiny', 'tiny.en', 'base',
+            'base.en', 'small', 'small.en', 'medium', 'medium.en', 'large',
+            'large-v1']. Defaults to 'tiny'.
+    **kwargs: optional
+        Additional arguments to override the default parameters for speech recognition. Accepts the following arguments:
+            - strategy (str): Sampling strategy to use. Choose from 'GREEDY' or 'BEAM_SEARCH'. Default: 'GREEDY'.
+            - print_progress (bool): Whether to print progress messages during transcription. Default: True.
+            - print_realtime (bool): Whether to print transcription results in real time. Default: True.
+
+    Attributes:
+    -----------
+    ctx: whisper_context *
+        The pointer to the Whisper context used for speech recognition.
+    params: whisper_full_params
+        The parameters used for speech recognition.
+    """
     cdef whisper_context * ctx
     cdef whisper_full_params params
 
-    def __init__(self, model=DEFAULT_MODEL, pb=None):
-        model_fullname = f'ggml-{model}.bin'.encode('utf8')
+    def __init__(self, model='tiny', **kwargs):
+        model_fullname = f'ggml-{model}.bin'
         download_model(model_fullname)
         model_path = Path(MODELS_DIR).joinpath(model_fullname)
         cdef bytes model_b = str(model_path).encode('utf8')
-        self.ctx = whisper_init(model_b)
-        self.params = default_params()
+        self.ctx = whisper_init_from_file(model_b)
+        self.params = default_params(kwargs.get('strategy', 'GREEDY'))
         whisper_print_system_info()
+        # Override default params
+        self.params.print_progress = kwargs.get('print_progress', True)
+        self.params.print_realtime = kwargs.get('print_realtime', True)
+
 
     def __dealloc__(self):
         whisper_free(self.ctx)
 
-    def transcribe(self, filename=TEST_FILE):
-        print("Loading data..")
-        cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames = load_audio(<bytes>filename)
-
-        print("Transcribing..")
-        return whisper_full(self.ctx, self.params, &frames[0], len(frames))
+    def transcribe(self, filename=TEST_FILE, language=None):
+        print("Transcribing...")
+        cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames = load_audio(<bytes>filename, SAMPLE_RATE)
+        if language:
+            print("Language:", language)
+            LANGUAGE = language.encode('utf-8')
+            self.params.language = LANGUAGE
+        else:
+            self.params.language = NULL
+        transcript = whisper_full(self.ctx, self.params, &frames[0], len(frames))
+        return transcript
 
     def extract_text(self, int res):
         print("Extracting text...")