Structure and Enumeration Types

Sampling Parameters

/* Parameters controlling the output-sampling algorithm of the model. */
typedef struct common_params_sampling_s {
    int32_t top_k;           // Top-k sampling: a value <= 0 uses the entire vocabulary.
    float top_p;             // Top-p (nucleus) sampling: 1.0 disables it.
    float min_p;             // Minimum probability threshold: 0.0 disables it.
    float temp;              // Temperature coefficient: greedy sampling is used when <= 0.0.
    float typ_p;             // Typical-p sampling: 1.0 disables it.
    int32_t min_keep;        // Minimum number of tokens to retain.
    int32_t penalty_last_n;  // History window: the repetition penalty only applies to the most recent n tokens.
    float penalty_repeat;    // Repetition penalty coefficient: larger values mean less repetition.
    float penalty_freq;      // Frequency penalty coefficient: larger values mean less repetition.
    float penalty_present;   // Presence penalty: applied once for any token that has already appeared.
} common_params_sampling_t;
  • Parameter settings for the model output sampling algorithm.

Model Types

/* Supported model architectures. Currently deepseek, internlm, omni and qwen2.5 are usable. */
typedef enum xlm_model_type_e {
    XLM_MODEL_TYPE_INTERNVL = 0,  // internvl (not supported yet)
    XLM_MODEL_TYPE_DEEPSEEK = 1,  // deepseek
    XLM_MODEL_TYPE_QWEN     = 2,  // qwen (not supported yet)
    XLM_MODEL_TYPE_LLAMA    = 3,  // llama (not supported yet)
    XLM_MODEL_TYPE_INTERNLM = 4,  // internlm
    XLM_MODEL_TYPE_OMNI     = 5,  // omni
    XLM_MODEL_TYPE_QWEN_VL  = 6,  // qwen-vl (not supported yet)
    XLM_MODEL_TYPE_QWEN2_5  = 7   // qwen2.5
} xlm_model_type;
  • Specify the model type. Currently, deepseek, internlm, omni and qwen2.5 are supported.

General Parameters

typedef struct xlm_common_params_s { const char *model_path; // Model file path const char *omni_visual_model_path; // Omni visual model file path const char *omni_audio_model_path; // Omni audio model file path const char *omni_text_model_path; // Omni text model file path bool omni_online_mode; // Whether Omni is in online mode const char *embed_tokens; // Omni embed_tokens path const char *token_config_path; // Tokenizer configuration path const char *config_path; // Path to other configuration files bool k_cache_int8; // Whether to use k-int8 quantization xlm_model_type model_type; // Model type int32_t context_size; // Context Length common_params_sampling_t sampling; // Sampling Parameter char *prompt_file; // External prompt file name char *path_prompt_cache; // Prompt cache file path } xlm_common_params_t;
  • Configuration of model general parameters. The default parameters can be obtained via the xlm_create_default_param interface.

Input Types

/* The input type of a request. */
typedef enum xlm_input_type_e {
    XLM_INPUT_PROMPT      = 0,  // Plain-text prompt input.
    XLM_INPUT_TOKEN       = 1,  // Token ID input (not supported yet).
    XLM_INPUT_MULTI_MODAL = 2   // Multimodal input.
} xlm_input_type;
  • The input type of the request.

Inference Backend Type

/* Inference backend selection; supports binding inference to a specific BPU core. */
typedef enum xlm_infer_backend_e {
    XLM_INFER_BACKEND_ANY     = 0,  // Any kernel.
    XLM_INFER_BACKEND_BPU_ANY = 1,  // Any BPU core.
    XLM_INFER_BACKEND_BPU_0   = 2,  // BPU core 0.
    XLM_INFER_BACKEND_BPU_1   = 3,  // BPU core 1.
    XLM_INFER_BACKEND_BPU_2   = 4,  // BPU core 2.
    XLM_INFER_BACKEND_BPU_3   = 5   // BPU core 3.
} xlm_infer_backend;
  • Inference backend type setting, which supports the BPU core binding function.

Image Preprocessing type

/* Image preprocessing mode. Currently only dynamic resolution is supported. */
typedef enum xlm_img_preprocess_type_e {
    XLM_IMG_PREPROCESS_DYNAMIC = 0,  // Dynamic resolution (default).
    XLM_IMG_PREPROCESS_NONE    = 1   // No preprocessing.
} xlm_img_preprocess_type;
  • Image preprocessing type setting. Currently, only dynamic resolution is supported.

Token Input Structure

/* Token-ID input payload (used with XLM_INPUT_TOKEN). */
typedef struct xlm_input_token_s {
    int32_t *tokens;     // Token ID array.
    int32_t tokens_size; // Number of tokens in the array.
} xlm_input_token_t;
  • The structure of the token input type.

Image Input Structure

typedef struct xlm_input_image_s { const char *image_path; // Image path (either one). const uint8_t *image_data; // Image data (either one). int32_t image_width; // Image width int32_t image_height; // Image height xlm_img_preprocess_type image_preprocess; // Preprocessing method } xlm_input_image_t; - The structure of the token input type. ## Image Input Structure ```cpp typedef struct xlm_input_image_s { const char *image_path; // Image path (either one). const uint8_t *image_data; // Image data (either one). int32_t image_width; // Image width int32_t image_height; // Image height xlm_img_preprocess_type image_preprocess; // Preprocessing method } xlm_input_image_t;
  • The image structure in multimodal input.

Multimodal Input Structure

typedef struct xlm_input_multi_modal_s { const char *prompt; // Text prompt int32_t image_num; // Image count xlm_input_image_t *images; // Image array } xlm_input_multi_modal_t;
  • The structure of the multimodal input type.

    • Supports single prompt and single image.

    • Supports single prompt with multiple images. (not support yet)

Omni Video Input Structure

/* Online video input for the Omni model (NV12 frame). */
typedef struct omni_online_video_s {
    uint8_t *y_ptr;  // Address of the NV12 Y plane.
    uint8_t *uv_ptr; // Address of the NV12 interleaved UV plane.
    int32_t width;   // Frame width in pixels.
    int32_t height;  // Frame height in pixels.
} omni_online_video_t;
  • Structure of online video input parameters for the Omni model.

Omni Audio Input Structure

/* Online audio input for the Omni model. */
typedef struct omni_online_audio_s {
    const float *data;  // Start address of the audio sample data.
    int32_t data_size;  // Number of samples in the buffer.
} omni_online_audio_t;
  • Structure of online audio input parameters for the Omni model.

Omni Text Input Structure

/* Online text input for the Omni model. */
typedef struct omni_online_text_s {
    const char *system_text; // System text content.
    const char *user_text;   // User text content.
} omni_online_text_t;
  • Structure of online text input parameters for the Omni model.

Priority Type

/* Request priority levels. Preemption order: URGENT preempts HIGH, HIGH preempts NORMAL. */
typedef enum xlm_priority_type_e {
    XLM_PRIORITY_TYPE_NORMAL = 0,  // Normal.
    XLM_PRIORITY_TYPE_HIGH   = 1,  // High.
    XLM_PRIORITY_TYPE_URGENT = 2   // Urgent.
} xlm_priority_type_t;

/* Priority setting for a model input request. */
typedef struct xlm_priority_s {
    xlm_priority_type_t type; // Priority type.
    int32_t priority;         // Only valid when type is NORMAL; range [0, 253], higher value is scheduled first.
} xlm_priority_t;
  • Model input request priority settings.

    • The preemption relationships are as follows:

      • XLM_PRIORITY_TYPE_URGENT --preemption--> XLM_PRIORITY_TYPE_HIGH --preemption--> XLM_PRIORITY_TYPE_NORMAL.
    • When both requests have priority NORMAL, preemption will not occur; instead, the execution order is determined by the value of the priority field.

      • A higher priority value indicates a higher priority. The value range is [0, 253].

PPL Parameter Structure

/* Parameters for PPL (perplexity) evaluation. Currently only the InternVL model is supported. */
typedef struct xlm_ppl_s {
    bool load_ckpt;            // Whether to enable checkpoint resumption for testing.
    int32_t text_data_num;     // Truncate the text to this length.
    int32_t max_length;        // Sequence length fed into the model on each step.
    int32_t stride;            // Test step size.
    const char *testcase_name; // Filename of the test case.
    const char *hbm_path;      // Path of the test model.
} xlm_ppl_t;
  • The parameter structure for the model's PPL (perplexity) evaluation; currently only the PPL of the InternVL model is supported.

Single Inference Request Structure

typedef struct xlm_lm_request_s { int32_t request_id; // Request id, corresponds one-to-one with the result returned xlm_input_type type; // Input type bool new_chat; // Whether it is a new conversation const char *prompt_json; // Omni json input union { const char *prompt; xlm_input_token_t token; xlm_input_multi_modal_t multi_modal_requset; }; const char *system_prompt; const char *chat_template; xlm_infer_backend infer_backend; // Inference backend xlm_priority_t priority; // Priority xlm_ppl_t *ppl; // PPL parameters } xlm_lm_request_t;
  • Single inference request structure. (In the future, it will support multiple requests at the same time.)

Inference Input Structure

typedef struct xlm_input_s { int32_t request_num; // Request count xlm_lm_request_t *requests; // Request array } xlm_input_t;
  • Model inference input structure, which is passed as a parameter to the inference interface.

Performance Data Structure

/* Performance data returned at the end of an inference run. */
typedef struct xlm_model_performance_s {
    double vit_cost;           // ViT cost time in ms.
    int64_t prefill_token_num; // Prefill token count.
    double prefill_tps;        // Prefill speed in tokens/s.
    int64_t decode_token_num;  // Decode token count.
    double decode_tps;         // Decode speed in tokens/s.
    double ttft;               // Time to first token.
    double tpot;               // Time per output token.
} xlm_model_performance_t;
  • Model performance data structure. At the end of inference, it will return the performance data of this inference.

Inference Result Structure

typedef struct xlm_result_s { char *text; // Inference result text int32_t request_id; // Corresponding request ID xlm_model_performance_t performance; // Model performance data } xlm_result_t;
  • The structure for returning inference result.

Interface Status

/* Inference status of the model, returned alongside inference results. */
typedef enum xlm_state_e {
    XLM_STATE_START   = 0,  // Start.
    XLM_STATE_END     = 1,  // End.
    XLM_STATE_RUNNING = 2,  // Running.
    XLM_STATE_ERROR   = 3   // Error.
} xlm_state_t;
  • The current inference status of the model is returned along with the inference results.