from __future__ import annotations import os import warnings from ctypes import ( CFUNCTYPE, c_bool, c_char_p, c_int, c_int64, c_uint8, c_uint32, c_size_t, c_float, c_void_p, POINTER, _Pointer, # type: ignore Structure, byref, ) import pathlib from typing import ( Union, NewType, Optional, TYPE_CHECKING, ) import llama_cpp.llama_cpp as llama_cpp from llama_cpp._ctypes_extensions import ( load_shared_library, ctypes_function_for_shared_library, ) if TYPE_CHECKING: from llama_cpp._ctypes_extensions import ( CtypesArray, ) # Specify the base name of the shared library to load _libmtmd_base_name = "mtmd" _libmtmd_override_path = os.environ.get("MTMD_CPP_LIB") _libmtmd_base_path = ( pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libmtmd_override_path is None else pathlib.Path(_libmtmd_override_path) ) # Load the library _libmtmd = load_shared_library(_libmtmd_base_name, _libmtmd_base_path) ctypes_function = ctypes_function_for_shared_library(_libmtmd) ################################################ # mtmd.h types ################################################ # Opaque types mtmd_context_p = NewType("mtmd_context_p", int) mtmd_context_p_ctypes = c_void_p mtmd_bitmap_p = NewType("mtmd_bitmap_p", int) mtmd_bitmap_p_ctypes = c_void_p mtmd_helper_video_p = NewType("mtmd_helper_video_p", int) mtmd_helper_video_p_ctypes = c_void_p mtmd_image_tokens_p = NewType("mtmd_image_tokens_p", int) mtmd_image_tokens_p_ctypes = c_void_p mtmd_input_chunk_p = NewType("mtmd_input_chunk_p", int) mtmd_input_chunk_p_ctypes = c_void_p mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int) mtmd_input_chunks_p_ctypes = c_void_p # Enums MTMD_INPUT_CHUNK_TYPE_TEXT = 0 MTMD_INPUT_CHUNK_TYPE_IMAGE = 1 MTMD_INPUT_CHUNK_TYPE_AUDIO = 2 # Structures class mtmd_context_params(Structure): """Context parameters for MTMD initialization. `image_marker` is deprecated upstream and kept for compatibility; use `media_marker` for multimodal prompt placeholders. """ if TYPE_CHECKING: use_gpu: bool print_timings: bool n_threads: int image_marker: Optional[bytes] media_marker: Optional[bytes] flash_attn_type: int warmup: bool image_min_tokens: int image_max_tokens: int cb_eval: llama_cpp.ggml_backend_sched_eval_callback cb_eval_user_data: c_void_p _fields_ = [ ("use_gpu", c_bool), ("print_timings", c_bool), ("n_threads", c_int), ("image_marker", c_char_p), ("media_marker", c_char_p), ("flash_attn_type", c_int), ("warmup", c_bool), ("image_min_tokens", c_int), ("image_max_tokens", c_int), ("cb_eval", llama_cpp.ggml_backend_sched_eval_callback), ("cb_eval_user_data", c_void_p), ] class mtmd_input_text(Structure): """Text input passed to `mtmd_tokenize`.""" _fields_ = [ ("text", c_char_p), ("add_special", c_bool), ("parse_special", c_bool), ] class mtmd_decoder_pos(Structure): """Decoder attention position for M-RoPE models.""" _fields_ = [ ("t", c_uint32), ("x", c_uint32), ("y", c_uint32), ("z", c_uint32), ] # struct mtmd_caps { # bool inp_vision; # bool inp_audio; # }; class mtmd_caps(Structure): """Capabilities exposed by an mmproj file.""" if TYPE_CHECKING: inp_vision: bool inp_audio: bool _fields_ = [ ("inp_vision", c_bool), ("inp_audio", c_bool), ] mtmd_bitmap_lazy_callback = CFUNCTYPE( c_int, c_size_t, c_void_p, POINTER(mtmd_bitmap_p_ctypes), POINTER(c_char_p), ) class mtmd_helper_bitmap_wrapper(Structure): """Bitmap wrapper returned by MTMD helper media loaders.""" if TYPE_CHECKING: bitmap: Optional[mtmd_bitmap_p] video_ctx: Optional[mtmd_helper_video_p] _fields_ = [ ("bitmap", mtmd_bitmap_p_ctypes), ("video_ctx", mtmd_helper_video_p_ctypes), ] class mtmd_helper_video_info(Structure): """Metadata for a decoded video stream.""" if TYPE_CHECKING: width: int height: int fps: float n_frames: int _fields_ = [ ("width", c_uint32), ("height", c_uint32), ("fps", c_float), ("n_frames", c_int), ] class mtmd_helper_video_init_params(Structure): """Parameters for initializing an MTMD helper video stream.""" if TYPE_CHECKING: fps_target: float ffmpeg_bin_dir: Optional[bytes] timestamp_interval_ms: int _fields_ = [ ("fps_target", c_float), ("ffmpeg_bin_dir", c_char_p), ("timestamp_interval_ms", c_int64), ] ################################################ # mtmd.h functions ################################################ # MTMD_API const char * mtmd_default_marker(void); @ctypes_function("mtmd_default_marker", [], c_char_p) def mtmd_default_marker() -> bytes: """Return the default media marker.""" ... # MTMD_API struct mtmd_context_params mtmd_context_params_default(void); @ctypes_function("mtmd_context_params_default", [], mtmd_context_params) def mtmd_context_params_default() -> mtmd_context_params: """Return the default MTMD context parameters.""" ... # MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname, # const struct llama_model * text_model, # const struct mtmd_context_params ctx_params); @ctypes_function( "mtmd_init_from_file", [c_char_p, llama_cpp.llama_model_p_ctypes, mtmd_context_params], mtmd_context_p_ctypes, ) def mtmd_init_from_file( mmproj_fname: bytes, text_model: llama_cpp.llama_model_p, ctx_params: mtmd_context_params, /, ) -> Optional[mtmd_context_p]: """Initialize the MTMD context from a projector file. Returns None on failure.""" ... # MTMD_API void mtmd_free(mtmd_context * ctx); @ctypes_function("mtmd_free", [mtmd_context_p_ctypes], None) def mtmd_free(ctx: mtmd_context_p, /): ... # MTMD_API bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk); @ctypes_function( "mtmd_decode_use_non_causal", [mtmd_context_p_ctypes, mtmd_input_chunk_p_ctypes], c_bool, ) def mtmd_decode_use_non_causal( ctx: mtmd_context_p, chunk: Optional[mtmd_input_chunk_p], / ) -> bool: """Check whether MTMD decoding uses non-causal attention.""" ... # MTMD_API bool mtmd_decode_use_mrope(const mtmd_context * ctx); @ctypes_function("mtmd_decode_use_mrope", [mtmd_context_p_ctypes], c_bool) def mtmd_decode_use_mrope(ctx: mtmd_context_p, /) -> bool: """Check whether MTMD decoding uses mRoPE.""" ... # MTMD_API bool mtmd_support_vision(const mtmd_context * ctx); @ctypes_function("mtmd_support_vision", [mtmd_context_p_ctypes], c_bool) def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool: """Check whether the current model supports vision input.""" ... # MTMD_API bool mtmd_support_audio(const mtmd_context * ctx); @ctypes_function("mtmd_support_audio", [mtmd_context_p_ctypes], c_bool) def mtmd_support_audio(ctx: mtmd_context_p, /) -> bool: """Check whether MTMD supports audio.""" ... # MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx); @ctypes_function("mtmd_get_audio_sample_rate", [mtmd_context_p_ctypes], c_int) def mtmd_get_audio_sample_rate(ctx: mtmd_context_p, /) -> int: """Get the audio sample rate in Hz. Returns -1 if audio is not supported.""" ... # MTMD_API const char * mtmd_get_marker(const mtmd_context * ctx); @ctypes_function("mtmd_get_marker", [mtmd_context_p_ctypes], c_char_p) def mtmd_get_marker(ctx: mtmd_context_p, /) -> Optional[bytes]: """Get the current media marker string.""" ... # Deprecated compatibility wrapper for the renamed mtmd_get_audio_sample_rate(). def mtmd_get_audio_bitrate(ctx: mtmd_context_p, /) -> int: warnings.warn( "mtmd_get_audio_bitrate is deprecated; use mtmd_get_audio_sample_rate instead", DeprecationWarning, stacklevel=2, ) return mtmd_get_audio_sample_rate(ctx) # MTMD_API mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny, const unsigned char * data); @ctypes_function( "mtmd_bitmap_init", [c_uint32, c_uint32, POINTER(c_uint8)], mtmd_bitmap_p_ctypes ) def mtmd_bitmap_init( nx: Union[c_uint32, int], ny: Union[c_uint32, int], data: CtypesArray[c_uint8], /, ) -> Optional[mtmd_bitmap_p]: ... # MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data); @ctypes_function( "mtmd_bitmap_init_from_audio", [c_size_t, POINTER(c_float)], mtmd_bitmap_p_ctypes, ) def mtmd_bitmap_init_from_audio( n_samples: Union[c_size_t, int], data: CtypesArray[c_float], /, ) -> Optional[mtmd_bitmap_p]: """Initialize an MTMD bitmap from audio samples.""" ... # MTMD_API void mtmd_bitmap_free(mtmd_bitmap * bitmap); @ctypes_function("mtmd_bitmap_free", [mtmd_bitmap_p_ctypes], None) def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /): ... # MTMD_API uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap); @ctypes_function("mtmd_bitmap_get_nx", [mtmd_bitmap_p_ctypes], c_uint32) def mtmd_bitmap_get_nx(bitmap: mtmd_bitmap_p, /) -> int: """Get the bitmap width in pixels.""" ... # MTMD_API uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap); @ctypes_function("mtmd_bitmap_get_ny", [mtmd_bitmap_p_ctypes], c_uint32) def mtmd_bitmap_get_ny(bitmap: mtmd_bitmap_p, /) -> int: """Get the bitmap height in pixels.""" ... # MTMD_API const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap); @ctypes_function("mtmd_bitmap_get_data", [mtmd_bitmap_p_ctypes], POINTER(c_uint8)) def mtmd_bitmap_get_data(bitmap: mtmd_bitmap_p, /) -> Optional[CtypesArray[c_uint8]]: """Get the raw bitmap data buffer.""" ... # MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap); @ctypes_function("mtmd_bitmap_get_n_bytes", [mtmd_bitmap_p_ctypes], c_size_t) def mtmd_bitmap_get_n_bytes(bitmap: mtmd_bitmap_p, /) -> int: """Get the bitmap data size in bytes.""" ... # MTMD_API bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap); @ctypes_function("mtmd_bitmap_is_audio", [mtmd_bitmap_p_ctypes], c_bool) def mtmd_bitmap_is_audio(bitmap: mtmd_bitmap_p, /) -> bool: """Check whether the bitmap contains audio data.""" ... # MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap); @ctypes_function("mtmd_bitmap_get_id", [mtmd_bitmap_p_ctypes], c_char_p) def mtmd_bitmap_get_id(bitmap: mtmd_bitmap_p, /) -> Optional[bytes]: """Get the optional bitmap identifier.""" ... # MTMD_API void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id); @ctypes_function("mtmd_bitmap_set_id", [mtmd_bitmap_p_ctypes, c_char_p], None) def mtmd_bitmap_set_id(bitmap: mtmd_bitmap_p, id: Optional[bytes], /): """Set the optional bitmap identifier.""" ... # MTMD_API mtmd_bitmap * mtmd_bitmap_init_lazy(mtmd_context * ctx, # const char * id, # void * user_data, # mtmd_bitmap_lazy_callback callback); @ctypes_function( "mtmd_bitmap_init_lazy", [mtmd_context_p_ctypes, c_char_p, c_void_p, mtmd_bitmap_lazy_callback], mtmd_bitmap_p_ctypes, ) def mtmd_bitmap_init_lazy( ctx: mtmd_context_p, id: Optional[bytes], user_data: c_void_p, callback: mtmd_bitmap_lazy_callback, /, ) -> Optional[mtmd_bitmap_p]: """Initialize a lazy MTMD bitmap.""" ... # MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void); @ctypes_function("mtmd_input_chunks_init", [], mtmd_input_chunks_p_ctypes) def mtmd_input_chunks_init() -> Optional[mtmd_input_chunks_p]: ... # MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks); @ctypes_function("mtmd_input_chunks_free", [mtmd_input_chunks_p_ctypes], None) def mtmd_input_chunks_free(chunks: mtmd_input_chunks_p, /): ... # MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks); @ctypes_function("mtmd_input_chunks_size", [mtmd_input_chunks_p_ctypes], c_size_t) def mtmd_input_chunks_size(chunks: mtmd_input_chunks_p, /) -> int: ... # MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx); @ctypes_function( "mtmd_input_chunks_get", [mtmd_input_chunks_p_ctypes, c_size_t], mtmd_input_chunk_p_ctypes, ) def mtmd_input_chunks_get( chunks: mtmd_input_chunks_p, idx: Union[c_size_t, int], / ) -> Optional[mtmd_input_chunk_p]: ... # MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx, # mtmd_input_chunks * output, # const mtmd_input_text * text, # const mtmd_bitmap ** bitmaps, # size_t n_bitmaps); @ctypes_function( "mtmd_tokenize", [ mtmd_context_p_ctypes, mtmd_input_chunks_p_ctypes, POINTER(mtmd_input_text), POINTER(mtmd_bitmap_p_ctypes), c_size_t, ], c_int, ) def mtmd_tokenize( ctx: mtmd_context_p, output: mtmd_input_chunks_p, text: "_Pointer[mtmd_input_text]", bitmaps: CtypesArray[mtmd_bitmap_p_ctypes], n_bitmaps: Union[c_size_t, int], /, ) -> int: ... # MTMD_API size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk); @ctypes_function("mtmd_input_chunk_get_n_tokens", [mtmd_input_chunk_p_ctypes], c_size_t) def mtmd_input_chunk_get_n_tokens(chunk: mtmd_input_chunk_p, /) -> int: ... # MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk); @ctypes_function("mtmd_input_chunk_get_type", [mtmd_input_chunk_p_ctypes], c_int) def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p, /) -> int: ... # MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output); @ctypes_function( "mtmd_input_chunk_get_tokens_text", [mtmd_input_chunk_p_ctypes, POINTER(c_size_t)], POINTER(llama_cpp.llama_token), ) def mtmd_input_chunk_get_tokens_text( chunk: mtmd_input_chunk_p, n_tokens_output: "_Pointer[c_size_t]", / ) -> Optional["_Pointer[llama_cpp.llama_token]"]: ... # MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk); @ctypes_function( "mtmd_input_chunk_get_tokens_image", [mtmd_input_chunk_p_ctypes], mtmd_image_tokens_p_ctypes, ) def mtmd_input_chunk_get_tokens_image( chunk: mtmd_input_chunk_p, / ) -> Optional[mtmd_image_tokens_p]: ... # MTMD_API const char * mtmd_input_chunk_get_id(const mtmd_input_chunk * chunk); @ctypes_function("mtmd_input_chunk_get_id", [mtmd_input_chunk_p_ctypes], c_char_p) def mtmd_input_chunk_get_id(chunk: mtmd_input_chunk_p, /) -> Optional[bytes]: """Get the optional chunk identifier.""" ... # MTMD_API llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk); @ctypes_function( "mtmd_input_chunk_get_n_pos", [mtmd_input_chunk_p_ctypes], llama_cpp.llama_pos, ) def mtmd_input_chunk_get_n_pos(chunk: mtmd_input_chunk_p, /) -> int: """Get the number of positions consumed by the chunk.""" ... # MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk); @ctypes_function( "mtmd_input_chunk_copy", [mtmd_input_chunk_p_ctypes], mtmd_input_chunk_p_ctypes ) def mtmd_input_chunk_copy(chunk: mtmd_input_chunk_p, /) -> Optional[mtmd_input_chunk_p]: """Copy an input chunk and transfer ownership to the caller.""" ... # MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk); @ctypes_function("mtmd_input_chunk_free", [mtmd_input_chunk_p_ctypes], None) def mtmd_input_chunk_free(chunk: mtmd_input_chunk_p, /): """Free an owned input chunk.""" ... # MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); @ctypes_function( "mtmd_image_tokens_get_n_tokens", [mtmd_image_tokens_p_ctypes], c_size_t ) def mtmd_image_tokens_get_n_tokens(image_tokens: mtmd_image_tokens_p, /) -> int: """Get the number of image tokens.""" ... # DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens), # "use mtmd_image_tokens_get_decoder_pos() instead"); @ctypes_function("mtmd_image_tokens_get_nx", [mtmd_image_tokens_p_ctypes], c_size_t) def mtmd_image_tokens_get_nx(image_tokens: mtmd_image_tokens_p, /) -> int: """Get the image token grid width.""" ... # DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens), # "use mtmd_image_tokens_get_decoder_pos() instead"); @ctypes_function("mtmd_image_tokens_get_ny", [mtmd_image_tokens_p_ctypes], c_size_t) def mtmd_image_tokens_get_ny(image_tokens: mtmd_image_tokens_p, /) -> int: """Get the image token grid height.""" ... # MTMD_API const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens); @ctypes_function("mtmd_image_tokens_get_id", [mtmd_image_tokens_p_ctypes], c_char_p) def mtmd_image_tokens_get_id(image_tokens: mtmd_image_tokens_p, /) -> Optional[bytes]: """Get the optional image token identifier.""" ... # MTMD_API llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens); @ctypes_function( "mtmd_image_tokens_get_n_pos", [mtmd_image_tokens_p_ctypes], llama_cpp.llama_pos, ) def mtmd_image_tokens_get_n_pos(image_tokens: mtmd_image_tokens_p, /) -> int: """Get the number of positions consumed by the image tokens.""" ... # MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos( # const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i); @ctypes_function( "mtmd_image_tokens_get_decoder_pos", [mtmd_image_tokens_p_ctypes, llama_cpp.llama_pos, c_size_t], mtmd_decoder_pos, ) def mtmd_image_tokens_get_decoder_pos( image_tokens: mtmd_image_tokens_p, pos_0: llama_cpp.llama_pos, i: Union[c_size_t, int], /, ) -> mtmd_decoder_pos: """Get decoder attention position for an image embedding token.""" ... # MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens); @ctypes_function( "mtmd_encode", [mtmd_context_p_ctypes, mtmd_image_tokens_p_ctypes], c_int, ) def mtmd_encode(ctx: mtmd_context_p, image_tokens: mtmd_image_tokens_p, /) -> int: """Run an MTMD encode pass for image tokens.""" ... # MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk); @ctypes_function( "mtmd_encode_chunk", [mtmd_context_p_ctypes, mtmd_input_chunk_p_ctypes], c_int, ) def mtmd_encode_chunk(ctx: mtmd_context_p, chunk: mtmd_input_chunk_p, /) -> int: """Run an MTMD encode pass for a single chunk.""" ... # MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); @ctypes_function("mtmd_get_output_embd", [mtmd_context_p_ctypes], POINTER(c_float)) def mtmd_get_output_embd(ctx: mtmd_context_p, /) -> Optional[CtypesArray[c_float]]: """Get output embeddings from the last encode pass.""" ... # MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname); @ctypes_function("mtmd_get_cap_from_file", [c_char_p], mtmd_caps) def mtmd_get_cap_from_file(mmproj_fname: bytes, /) -> mtmd_caps: """Get mmproj capabilities without initializing a full MTMD context.""" ... # MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void); @ctypes_function("mtmd_test_create_input_chunks", [], mtmd_input_chunks_p_ctypes) def mtmd_test_create_input_chunks() -> Optional[mtmd_input_chunks_p]: """Create MTMD test chunks for the C API tests.""" ... ################################################ # mtmd-helper.h functions ################################################ # MTMD_API bool mtmd_helper_support_video(mtmd_context * ctx); @ctypes_function( "mtmd_helper_support_video", [mtmd_context_p_ctypes], c_bool, ) def mtmd_helper_support_video(ctx: mtmd_context_p, /) -> bool: """Check whether MTMD helper video support is available.""" ... # MTMD_API struct mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder); @ctypes_function( "mtmd_helper_bitmap_init_from_file", [mtmd_context_p_ctypes, c_char_p, c_bool], mtmd_helper_bitmap_wrapper, ) def mtmd_helper_bitmap_init_from_file_wrapper( ctx: mtmd_context_p, fname: bytes, placeholder: Union[c_bool, bool], / ) -> mtmd_helper_bitmap_wrapper: """Initialize an MTMD bitmap wrapper from a file.""" ... def mtmd_helper_bitmap_init_from_file( ctx: mtmd_context_p, fname: bytes, placeholder: Union[c_bool, bool], / ) -> Optional[mtmd_bitmap_p]: """Initialize an MTMD bitmap from a file.""" return mtmd_helper_bitmap_init_from_file_wrapper(ctx, fname, placeholder).bitmap # MTMD_API struct mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder); @ctypes_function( "mtmd_helper_bitmap_init_from_buf", [mtmd_context_p_ctypes, POINTER(c_uint8), c_size_t, c_bool], mtmd_helper_bitmap_wrapper, ) def mtmd_helper_bitmap_init_from_buf_wrapper( ctx: mtmd_context_p, buf: CtypesArray[c_uint8], length: Union[c_size_t, int], placeholder: Union[c_bool, bool], /, ) -> mtmd_helper_bitmap_wrapper: ... def mtmd_helper_bitmap_init_from_buf( ctx: mtmd_context_p, buf: CtypesArray[c_uint8], length: Union[c_size_t, int], placeholder: Union[c_bool, bool], /, ) -> Optional[mtmd_bitmap_p]: """Initialize an MTMD bitmap from a buffer.""" return mtmd_helper_bitmap_init_from_buf_wrapper( ctx, buf, length, placeholder ).bitmap # MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); @ctypes_function("mtmd_helper_get_n_tokens", [mtmd_input_chunks_p_ctypes], c_size_t) def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p, /) -> int: ... # MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks); @ctypes_function( "mtmd_helper_get_n_pos", [mtmd_input_chunks_p_ctypes], llama_cpp.llama_pos, ) def mtmd_helper_get_n_pos(chunks: mtmd_input_chunks_p, /) -> int: """Count the total positions consumed by the chunks.""" ... # MTMD_API void mtmd_helper_image_get_decoder_pos( # const mtmd_image_tokens * image, llama_pos pos_0, struct mtmd_decoder_pos * out_pos); @ctypes_function( "mtmd_helper_image_get_decoder_pos", [mtmd_image_tokens_p_ctypes, llama_cpp.llama_pos, POINTER(mtmd_decoder_pos)], None, ) def mtmd_helper_image_get_decoder_pos( image: mtmd_image_tokens_p, pos_0: llama_cpp.llama_pos, out_pos: "_Pointer[mtmd_decoder_pos]", /, ): """Fill decoder attention positions for all image embedding tokens.""" ... # MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, # struct llama_context * lctx, # const mtmd_input_chunks * chunks, # llama_pos n_past, # llama_seq_id seq_id, # int32_t n_batch, # bool logits_last, # llama_pos * new_n_past); @ctypes_function( "mtmd_helper_eval_chunks", [ mtmd_context_p_ctypes, llama_cpp.llama_context_p_ctypes, mtmd_input_chunks_p_ctypes, llama_cpp.llama_pos, llama_cpp.llama_seq_id, c_int, c_bool, POINTER(llama_cpp.llama_pos), ], c_int, ) def mtmd_helper_eval_chunks( ctx: mtmd_context_p, lctx: llama_cpp.llama_context_p, chunks: mtmd_input_chunks_p, n_past: llama_cpp.llama_pos, seq_id: llama_cpp.llama_seq_id, n_batch: Union[c_int, int], logits_last: Union[c_bool, bool], new_n_past: "_Pointer[llama_cpp.llama_pos]", /, ) -> int: ... # MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, # struct llama_context * lctx, # const mtmd_input_chunk * chunk, # llama_pos n_past, # llama_seq_id seq_id, # int32_t n_batch, # bool logits_last, # llama_pos * new_n_past); @ctypes_function( "mtmd_helper_eval_chunk_single", [ mtmd_context_p_ctypes, llama_cpp.llama_context_p_ctypes, mtmd_input_chunk_p_ctypes, llama_cpp.llama_pos, llama_cpp.llama_seq_id, c_int, c_bool, POINTER(llama_cpp.llama_pos), ], c_int, ) def mtmd_helper_eval_chunk_single( ctx: mtmd_context_p, lctx: llama_cpp.llama_context_p, chunk: mtmd_input_chunk_p, n_past: llama_cpp.llama_pos, seq_id: llama_cpp.llama_seq_id, n_batch: Union[c_int, int], logits_last: Union[c_bool, bool], new_n_past: "_Pointer[llama_cpp.llama_pos]", /, ) -> int: ... # MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx, # struct llama_context * lctx, # const mtmd_input_chunk * chunk, # float * encoded_embd, # llama_pos n_past, # llama_seq_id seq_id, # int32_t n_batch, # llama_pos * new_n_past); @ctypes_function( "mtmd_helper_decode_image_chunk", [ mtmd_context_p_ctypes, llama_cpp.llama_context_p_ctypes, mtmd_input_chunk_p_ctypes, POINTER(c_float), llama_cpp.llama_pos, llama_cpp.llama_seq_id, c_int, POINTER(llama_cpp.llama_pos), ], c_int, ) def mtmd_helper_decode_image_chunk( ctx: mtmd_context_p, lctx: llama_cpp.llama_context_p, chunk: mtmd_input_chunk_p, encoded_embd: CtypesArray[c_float], n_past: llama_cpp.llama_pos, seq_id: llama_cpp.llama_seq_id, n_batch: Union[c_int, int], new_n_past: "_Pointer[llama_cpp.llama_pos]", /, ) -> int: """Decode a pre-encoded image chunk.""" ... # MTMD_API struct mtmd_helper_video_init_params mtmd_helper_video_init_params_default(void); @ctypes_function( "mtmd_helper_video_init_params_default", [], mtmd_helper_video_init_params ) def mtmd_helper_video_init_params_default() -> mtmd_helper_video_init_params: """Return the default MTMD helper video initialization parameters.""" ... # MTMD_API mtmd_helper_video * mtmd_helper_video_init( # struct mtmd_context * mctx, # const char * path, # struct mtmd_helper_video_init_params params); @ctypes_function( "mtmd_helper_video_init", [mtmd_context_p_ctypes, c_char_p, mtmd_helper_video_init_params], mtmd_helper_video_p_ctypes, ) def mtmd_helper_video_init( ctx: mtmd_context_p, path: bytes, params: mtmd_helper_video_init_params, /, ) -> Optional[mtmd_helper_video_p]: """Initialize an MTMD helper video stream from a file path.""" ... # MTMD_API mtmd_helper_video * mtmd_helper_video_init_from_buf( # struct mtmd_context * mctx, # const unsigned char * buf, size_t len, # struct mtmd_helper_video_init_params params); @ctypes_function( "mtmd_helper_video_init_from_buf", [mtmd_context_p_ctypes, POINTER(c_uint8), c_size_t, mtmd_helper_video_init_params], mtmd_helper_video_p_ctypes, ) def mtmd_helper_video_init_from_buf( ctx: mtmd_context_p, buf: CtypesArray[c_uint8], length: Union[c_size_t, int], params: mtmd_helper_video_init_params, /, ) -> Optional[mtmd_helper_video_p]: """Initialize an MTMD helper video stream from a buffer.""" ... # MTMD_API void mtmd_helper_video_free(mtmd_helper_video * ctx); @ctypes_function("mtmd_helper_video_free", [mtmd_helper_video_p_ctypes], None) def mtmd_helper_video_free(ctx: mtmd_helper_video_p, /): """Free an MTMD helper video stream.""" ... # MTMD_API struct mtmd_helper_video_info mtmd_helper_video_get_info(const mtmd_helper_video * ctx); @ctypes_function( "mtmd_helper_video_get_info", [mtmd_helper_video_p_ctypes], mtmd_helper_video_info, ) def mtmd_helper_video_get_info(ctx: mtmd_helper_video_p, /) -> mtmd_helper_video_info: """Get metadata for an MTMD helper video stream.""" ... # MTMD_API int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx, # mtmd_bitmap ** out_bitmap, # char ** out_text); @ctypes_function( "mtmd_helper_video_read_next", [ mtmd_helper_video_p_ctypes, POINTER(mtmd_bitmap_p_ctypes), POINTER(c_char_p), ], c_int, ) def mtmd_helper_video_read_next( ctx: mtmd_helper_video_p, out_bitmap: "_Pointer[mtmd_bitmap_p_ctypes]", out_text: "_Pointer[c_char_p]", /, ) -> int: """Read the next bitmap or text chunk from an MTMD helper video stream.""" ... # MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data); @ctypes_function( "mtmd_log_set", [llama_cpp.llama_log_callback, c_void_p], None, ) def mtmd_log_set(log_callback, user_data: c_void_p, /): """Set the MTMD logging callback.""" ... # MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data); @ctypes_function( "mtmd_helper_log_set", [llama_cpp.llama_log_callback, c_void_p], None, ) def mtmd_helper_log_set(log_callback, user_data: c_void_p, /): """Set the MTMD helper logging callback.""" ...