configuration_gemma3n.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478
  1. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
  2. # This file was automatically generated from src/transformers/models/gemma3n/modular_gemma3n.py.
  3. # Do NOT edit this file manually as any edits will be overwritten by the generation of
  4. # the file from the modular. If any change should be done, please apply the change to the
  5. # modular_gemma3n.py file directly. One of our CI enforces this.
  6. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
  7. # Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
  8. #
  9. #
  10. # Licensed under the Apache License, Version 2.0 (the "License");
  11. # you may not use this file except in compliance with the License.
  12. # You may obtain a copy of the License at
  13. #
  14. # http://www.apache.org/licenses/LICENSE-2.0
  15. #
  16. # Unless required by applicable law or agreed to in writing, software
  17. # distributed under the License is distributed on an "AS IS" BASIS,
  18. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  19. # See the License for the specific language governing permissions and
  20. # limitations under the License.
  21. from collections.abc import Sequence
  22. from typing import Any
  23. from huggingface_hub.dataclasses import strict
  24. from ...configuration_utils import PreTrainedConfig
  25. from ...utils import auto_docstring, is_timm_available, logging, requires_backends
  26. if is_timm_available():
  27. from timm.data import ImageNetInfo, infer_imagenet_subset
  28. logger = logging.get_logger(__name__)
  29. @auto_docstring(checkpoint="google/gemma-3n-E4B")
  30. @strict
  31. class Gemma3nTextConfig(PreTrainedConfig):
  32. r"""
  33. vocab_size_per_layer_input (`int`, *optional*, defaults to 262144):
  34. Vocabulary size of the per-layer text embeddings that augment the standard embeddings.
  35. hidden_size_per_layer_input (`int`, *optional*, defaults to 256):
  36. Dimension of the hidden representations for per-layer emebeddings.
  37. altup_active_idx (`int`, *optional*, defaults to 0):
  38. The index of the prediction from which AltUp will compute additional predictions or correct
  39. altup_coef_clip (`float`, *optional*, defaults to 120.0):
  40. The maximum amplitude of an AltUp prediction or correction coefficient weight.
  41. altup_correct_scale (`bool`, *optional*, defaults to `True`):
  42. If True, apply the `AltUp.correct_output_scale` to the corrected prediction at `altup_active_idx`.
  43. altup_num_inputs (`int`, *optional*, defaults to 4):
  44. The number of predictions that AltUp should be make given the input sequence.
  45. num_kv_shared_layers (`int`, *optional*, defaults to 15):
  46. The number of layer that share KV cache values. During the forward pass, the last `num_kv_shared_layers`
  47. layers in the model "share" the KV values in that each local and global layer in this range uses the KV
  48. cache values computed for the last local or global layer, respectively, before entering this range. The
  49. value should be a multiple of the attention pattern size (see `layer_types` parameter).
  50. laurel_rank (int, *optional*, defaults to 64):
  51. The intermediate size for the linear projections in the Learned Augmented Residual Layer.
  52. activation_sparsity_pattern (Sequence[float], *optional*):
  53. The sparsity factor used to extract the top-k activations for a given layer. The provided Sequence must
  54. explicitly provide a sparsity value for each layer in the model. By default, the first 10 layers are
  55. sparse with a sparsity factor of 0.95 and the rest are dense.
  56. ```python
  57. >>> from transformers import Gemma3nTextModel, Gemma3nTextConfig
  58. >>> # Initializing a Gemma3nText gemma3n_text-E4B style configuration
  59. >>> configuration = Gemma3nTextConfig()
  60. >>> # Initializing a model from the gemma3n_text-E4B style configuration
  61. >>> model = Gemma3nTextModel(configuration)
  62. >>> # Accessing the model configuration
  63. >>> configuration = model.config
  64. ```
  65. """
  66. model_type = "gemma3n_text"
  67. keys_to_ignore_at_inference = ["past_key_values"]
  68. base_model_tp_plan = {
  69. "layers.*.self_attn.q_proj": "colwise",
  70. "layers.*.self_attn.k_proj": "colwise",
  71. "layers.*.self_attn.v_proj": "colwise",
  72. "layers.*.self_attn.q_norm": "replicated_with_grad_allreduce",
  73. "layers.*.self_attn.k_norm": "replicated_with_grad_allreduce",
  74. "layers.*.self_attn.v_norm": "replicated_with_grad_allreduce",
  75. "layers.*.self_attn.o_proj": "rowwise",
  76. "layers.*.mlp.gate_proj": "colwise",
  77. "layers.*.mlp.up_proj": "colwise",
  78. "layers.*.mlp.down_proj": "rowwise",
  79. }
  80. base_model_pp_plan = {
  81. "embed_tokens": (["input_ids"], ["inputs_embeds"]),
  82. "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
  83. "norm": (["hidden_states"], ["hidden_states"]),
  84. }
  85. vocab_size: int = 262_400
  86. hidden_size: int = 2048
  87. intermediate_size: int | list[int] = 16_384
  88. num_hidden_layers: int = 35
  89. num_attention_heads: int = 8
  90. num_key_value_heads: int = 2
  91. head_dim: int = 256
  92. hidden_activation: str = "gelu_pytorch_tanh"
  93. max_position_embeddings: int = 32_768
  94. initializer_range: float = 0.02
  95. rms_norm_eps: float = 1e-6
  96. use_cache: bool = True
  97. pad_token_id: int | None = 0
  98. eos_token_id: int | list[int] | None = 1
  99. bos_token_id: int | None = 2
  100. tie_word_embeddings: bool = True
  101. rope_parameters: dict | None = None
  102. attention_bias: bool = False
  103. attention_dropout: int | float | None = 0.0
  104. sliding_window: int = 512
  105. layer_types: list[str] | None = None
  106. final_logit_softcapping: float = 30.0
  107. default_theta = {"global": 1_000_000.0, "local": 10_000.0}
  108. vocab_size_per_layer_input: int = 262_144
  109. hidden_size_per_layer_input: int = 256
  110. altup_active_idx: int = 0
  111. altup_coef_clip: float = 120.0
  112. altup_correct_scale: bool = True
  113. altup_num_inputs: int = 4
  114. num_kv_shared_layers: int = 15
  115. laurel_rank: int = 64
  116. activation_sparsity_pattern: float | list[float] | None = None
  117. def __post_init__(self, **kwargs):
  118. if (
  119. isinstance(self.intermediate_size, Sequence)
  120. and (intsize_len := len(self.intermediate_size)) != self.num_hidden_layers
  121. ):
  122. raise ValueError(
  123. "intermediate_size must have an explicit intermediate size for every layer or one for all layers. "
  124. f"Expected {self.num_hidden_layers} values but got {intsize_len}."
  125. )
  126. elif not isinstance(self.intermediate_size, Sequence):
  127. self.intermediate_size = [self.intermediate_size] * self.num_hidden_layers
  128. if self.layer_types is None:
  129. self.layer_types = [
  130. "full_attention" if (i + 1) % 5 == 0 else "sliding_attention" for i in range(self.num_hidden_layers)
  131. ]
  132. if self.activation_sparsity_pattern is None:
  133. num_sparse_layers = 10 if self.num_hidden_layers > 10 else 0
  134. self.activation_sparsity_pattern = [0.95] * num_sparse_layers + [0.0] * (
  135. self.num_hidden_layers - num_sparse_layers
  136. )
  137. if (len_asp := len(self.activation_sparsity_pattern)) != self.num_hidden_layers:
  138. raise ValueError(
  139. "activation_sparsity_pattern must have an explicit activation sparsity value for every layer."
  140. f"Expected {self.num_hidden_layers} values but got {len_asp}."
  141. )
  142. super().__post_init__(**kwargs)
  143. def validate_architecture(self):
  144. """Part of `@strict`-powered validation. Validates the architecture of the config."""
  145. if self.hidden_size % self.num_attention_heads != 0:
  146. raise ValueError(
  147. f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
  148. f"heads ({self.num_attention_heads})."
  149. )
  150. def convert_rope_params_to_dict(self, **kwargs):
  151. rope_scaling = kwargs.pop("rope_scaling", None)
  152. # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters`
  153. # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format
  154. default_rope_params = {
  155. "sliding_attention": {"rope_type": "default"},
  156. "full_attention": {"rope_type": "default"},
  157. }
  158. self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params
  159. if rope_scaling is not None:
  160. self.rope_parameters["full_attention"].update(rope_scaling)
  161. # Set default values if not present
  162. if self.rope_parameters.get("full_attention") is None:
  163. self.rope_parameters["full_attention"] = {"rope_type": "default"}
  164. self.rope_parameters["full_attention"].setdefault(
  165. "rope_theta", kwargs.pop("rope_theta", self.default_theta["global"])
  166. )
  167. if self.rope_parameters.get("sliding_attention") is None:
  168. self.rope_parameters["sliding_attention"] = {"rope_type": "default"}
  169. self.rope_parameters["sliding_attention"].setdefault(
  170. "rope_theta", kwargs.pop("rope_local_base_freq", self.default_theta["local"])
  171. )
  172. # Standardize and validate the correctness of rotary position embeddings parameters
  173. self.standardize_rope_params()
  174. return kwargs
  175. @auto_docstring(checkpoint="google/gemma-3n-E4B")
  176. @strict
  177. class Gemma3nAudioConfig(PreTrainedConfig):
  178. r"""
  179. vocab_offset (`int`, *optional*, defaults to 262272):
  180. Offset between the tokenizer vocab index for the token ids embedded by `Gemma3nMultimodalEmbedder` and the
  181. 0-indexed `Gemma3nMultimodalEmbedder.embedding` table.
  182. input_feat_size (`int`, *optional*, defaults to 128):
  183. The number of channels in each mel-spectrogram frame.
  184. gradient_clipping (`float`, *optional*, defaults to 10000000000.0):
  185. Clipping value used to stabilize extremely large gradient values.
  186. conf_attention_chunk_size (`int`, *optional*, defaults to 12):
  187. The sub-sequence size for local attention processing inside the Conformer ("conf") section of the
  188. Universal Speech Model.
  189. conf_attention_context_left (`int`, *optional*, defaults to 13):
  190. The left context size of the local attention inside the Conformer ("conf") section of the
  191. Universal Speech Model.
  192. conf_attention_context_right (`int`, *optional*, defaults to 0):
  193. The right context size of the local attention inside the Conformer ("conf") section of the
  194. Universal Speech Model.
  195. conf_attention_logit_cap (`float`, *optional*, defaults to 50.0):
  196. Logit cap applied during local attention inside the Conformer ("conf") section of the
  197. Universal Speech Model.
  198. conf_num_attention_heads (`int`, *optional*, defaults to 8):
  199. The number of attention heads in local attention inside the Conformer ("conf") section of the
  200. Universal Speech Model.
  201. conf_num_hidden_layers (`int`, *optional*, defaults to 12):
  202. The number of layers that use local attention inside the Conformer ("conf") section of the
  203. Universal Speech Model.
  204. conf_conv_kernel_size (`int`, *optional*, defaults to 5):
  205. Convolution kernel size for the conformer block inside the Conformer ("conf") section of the
  206. Universal Speech Model.
  207. conf_reduction_factor (`int`, *optional*, defaults to 4):
  208. Reduction factor used in the conformer block inside the Conformer ("conf") section of the
  209. Universal Speech Model.
  210. conf_residual_weight (`float`, *optional*, defaults to 0.5):
  211. Residual connection weight inside the Conformer ("conf") section of the
  212. Universal Speech Model.
  213. sscp_conv_channel_size (`tuple(int, int)`, *optional*, defaults to `(128, 32)`):
  214. The channel sizes for the first and second convolutional layers in the Sub-sample Convolution Projection
  215. ("sscp") section of the Universal Speech Model.
  216. sscp_conv_group_norm_eps (`float`, *optional*, defaults to 0.001):
  217. Epsilon used in group normalization in the subsample convolution projection in the Sub-sample Convolution
  218. Projection ("sscp") section of the Universal Speech Model.
  219. sscp_conv_kernel_size (`tuple(tuple(int, int), tuple(int, int))`, *optional*, defaults to `((3, 3), (3, 3))`):
  220. Kernel sizes of the two convolutional layers in the subsample convolution projection in the Sub-sample
  221. Convolution Projection ("sscp") section of the Universal Speech Model. The kernel sizes are specified as a
  222. tuple of height and width for each layer, where the height corresponds to the time dimension and the width
  223. corresponds to the frequency dimension.
  224. sscp_conv_stride_size (`tuple(tuple(int, int), tuple(int, int))`, *optional*, defaults to `((2, 2), (2, 2))`):
  225. Stride sizes of the two convolutional layers in the subsample convolution projection in the Sub-sample
  226. Convolution Projection ("sscp") section of the Universal Speech Model. The stride sizes are specified as a
  227. tuple of height and width for each layer, where the height corresponds to the time dimension and the width
  228. corresponds to the frequency dimension.
  229. Example:
  230. ```python
  231. >>> from transformers import Gemma3nAudioConfig, Gemma3nAudioEncoder
  232. >>> # Initializing a Gemma3nAudioEncoder gemma3n_audio-E4B-style configuration
  233. >>> configuration = Gemma3nAudioConfig()
  234. >>> # Initializing a model from the gemma3n_audio-E4B style configuration
  235. >>> model = Gemma3nAudioEncoder(configuration)
  236. >>> # Accessing the model configuration
  237. >>> configuration = model.config
  238. ```
  239. """
  240. model_type = "gemma3n_audio"
  241. vocab_size: int = 128
  242. vocab_offset: int = 262_144 + 128 # text vocab size + vision vocab size
  243. input_feat_size: int = 128
  244. hidden_size: int = 1536
  245. rms_norm_eps: float = 1e-6
  246. gradient_clipping: float = 10_000_000_000.0
  247. conf_attention_chunk_size: int = 12
  248. conf_attention_context_left: int = 13
  249. conf_attention_context_right: int = 0
  250. conf_attention_logit_cap: float = 50.0
  251. conf_num_attention_heads: int = 8
  252. conf_num_hidden_layers: int = 12
  253. conf_conv_kernel_size: int = 5
  254. conf_reduction_factor: int = 4
  255. conf_residual_weight: float = 0.5
  256. sscp_conv_channel_size: list[int] | tuple[int, int] = (128, 32)
  257. sscp_conv_group_norm_eps: float = 1e-3
  258. sscp_conv_kernel_size: list | tuple[tuple[int, int], tuple[int, int]] = (
  259. (3, 3),
  260. (3, 3),
  261. )
  262. sscp_conv_stride_size: list | tuple[tuple[int, int], tuple[int, int]] = (
  263. (2, 2),
  264. (2, 2),
  265. )
  266. @auto_docstring(checkpoint="google/gemma-3n-E4B")
  267. @strict
  268. class Gemma3nVisionConfig(PreTrainedConfig):
  269. r"""
  270. architecture (`str`, *optional*, defaults to `"resnet50"`):
  271. The timm architecture to load.
  272. do_pooling (`bool`, *optional*, defaults to `True`):
  273. Whether to do pooling for the last_hidden_state in `TimmWrapperModel` or not.
  274. model_args (`dict[str, Any]`, *optional*):
  275. Additional keyword arguments to pass to the `timm.create_model` function. e.g. `model_args={"depth": 3}`
  276. for `timm/vit_base_patch32_clip_448.laion2b_ft_in12k_in1k` to create a model with 3 blocks. Defaults to `None`.
  277. vocab_offset (`int`, *optional*, defaults to 262144):
  278. Offset between the tokenizer vocab index for the token ids embedded by `Gemma3nMultimodalEmbedder` and the
  279. 0-indexed `Gemma3nMultimodalEmbedder.embedding` table.
  280. Example:
  281. ```python
  282. >>> from transformers import Gemma3nVisionConfig, TimmWrapper
  283. >>> # Initializing a TimmWrapper gemma3n_vision-E4B-style configuration
  284. >>> configuration = Gemma3nVisionConfig()
  285. >>> # Initializing a gemma3n_vision-E4B-style TimmWrapper from the configuration
  286. >>> model = TimmWrapper(configuration)
  287. >>> # Accessing the model configuration
  288. >>> configuration = model.config
  289. ```
  290. """
  291. model_type = "gemma3n_vision"
  292. architecture: str = "mobilenetv5_300m_enc"
  293. initializer_range: float = 0.02
  294. do_pooling: bool = False
  295. model_args: dict | None = None
  296. hidden_size: int = 2048
  297. vocab_size: int = 128
  298. vocab_offset: int = 262_144
  299. rms_norm_eps: float = 1e-06
  300. @classmethod
  301. def from_dict(cls, config_dict: dict[str, Any], **kwargs):
  302. # Create a copy to avoid mutating the original dict
  303. config_dict = config_dict.copy()
  304. label_names = config_dict.get("label_names")
  305. is_custom_model = "num_labels" in kwargs or "id2label" in kwargs
  306. # if no labels added to config, use imagenet labeller in timm
  307. if label_names is None and not is_custom_model:
  308. requires_backends(cls, ["timm"])
  309. imagenet_subset = infer_imagenet_subset(config_dict)
  310. if imagenet_subset:
  311. dataset_info = ImageNetInfo(imagenet_subset)
  312. synsets = dataset_info.label_names()
  313. label_descriptions = dataset_info.label_descriptions(as_dict=True)
  314. label_names = [label_descriptions[synset] for synset in synsets]
  315. if label_names is not None and not is_custom_model:
  316. kwargs["id2label"] = dict(enumerate(label_names))
  317. # if all label names are unique, create label2id mapping as well
  318. if len(set(label_names)) == len(label_names):
  319. kwargs["label2id"] = {name: i for i, name in enumerate(label_names)}
  320. else:
  321. kwargs["label2id"] = None
  322. # timm config stores the `num_classes` attribute in both the root of config and in the "pretrained_cfg" dict.
  323. # We are removing these attributes in order to have the native `transformers` num_labels attribute in config
  324. # and to avoid duplicate attributes
  325. num_labels_in_kwargs = kwargs.pop("num_labels", None)
  326. num_labels_in_dict = config_dict.pop("num_classes", None)
  327. # passed num_labels has priority over num_classes in config_dict
  328. kwargs["num_labels"] = num_labels_in_kwargs or num_labels_in_dict
  329. # pop num_classes from "pretrained_cfg",
  330. # it is not necessary to have it, only root one is used in timm
  331. if "pretrained_cfg" in config_dict and "num_classes" in config_dict["pretrained_cfg"]:
  332. config_dict["pretrained_cfg"].pop("num_classes", None)
  333. return super().from_dict(config_dict, **kwargs)
  334. def to_dict(self) -> dict[str, Any]:
  335. output = super().to_dict()
  336. output.setdefault("num_classes", self.num_labels)
  337. output.setdefault("label_names", list(self.id2label.values()))
  338. output.pop("id2label", None)
  339. output.pop("label2id", None)
  340. return output
  341. @auto_docstring(checkpoint="google/gemma-3n-E4B")
  342. @strict
  343. class Gemma3nConfig(PreTrainedConfig):
  344. r"""
  345. audio_soft_tokens_per_image (`int`, *optional*, defaults to 188):
  346. The number of soft tokens per audio clip.
  347. vision_soft_tokens_per_image (`int`, *optional*, defaults to 256):
  348. The number of soft tokens per image.
  349. boi_token_id (`int`, *optional*, defaults to 255999):
  350. The begin-of-image token index to wrap the image prompt.
  351. eoi_token_id (`int`, *optional*, defaults to 262144):
  352. The end-of-image token index to wrap the image prompt.
  353. boa_token_id (`int`, *optional*, defaults to 256000):
  354. The begin-of-audio token index to wrap the audio prompt.
  355. eoa_token_id (`int`, *optional*, defaults to 262272):
  356. The end-of-audio token index to wrap the audio prompt.
  357. Example:
  358. ```python
  359. >>> from transformers import Gemma3nForConditionalGeneration, Gemma3nConfig, Gemma3nTextConfig
  360. >>> # Initializing a MobileNet vision config, which is loaded from TIMM
  361. >>> vision_config = Gemma3nVisionConfig()
  362. >>> # Initializing a Gemma3n Audio config
  363. >>> audio_config = Gemma3nAudioConfig()
  364. >>> # Initializing a Gemma3n Text config
  365. >>> text_config = Gemma3nTextConfig()
  366. >>> # Initializing a Gemma3n gemma-3-4b style configuration
  367. >>> configuration = Gemma3nConfig(text_config, vision_config, audio_config)
  368. >>> # Initializing a model from the gemma-3-4b style configuration
  369. >>> model = Gemma3nTextConfig(configuration)
  370. >>> # Accessing the model configuration
  371. >>> configuration = model.config
  372. ```"""
  373. model_type = "gemma3n"
  374. sub_configs = {
  375. "text_config": Gemma3nTextConfig,
  376. "vision_config": Gemma3nVisionConfig,
  377. "audio_config": Gemma3nAudioConfig,
  378. }
  379. text_config: Gemma3nTextConfig | dict[str, Any] | None = None
  380. vision_config: Gemma3nVisionConfig | dict[str, Any] | None = None
  381. audio_config: Gemma3nAudioConfig | dict[str, Any] | None = None
  382. audio_soft_tokens_per_image: int | None = 188
  383. vision_soft_tokens_per_image: int | None = 256
  384. boi_token_id: int | None = 255_999
  385. eoi_token_id: int | None = 262_144
  386. image_token_id: int | None = 262_145
  387. boa_token_id: int | None = 256_000
  388. eoa_token_id: int | None = 262_272
  389. audio_token_id: int | None = 262_273
  390. initializer_range: float | None = 0.02
  391. tie_word_embeddings: bool | None = True
  392. def __post_init__(self, **kwargs):
  393. if self.text_config is None:
  394. self.text_config = Gemma3nTextConfig()
  395. logger.info("text_config is None, using default Gemma3nTextConfig text config.")
  396. elif isinstance(self.text_config, dict):
  397. self.text_config = Gemma3nTextConfig(**self.text_config)
  398. if isinstance(self.vision_config, dict):
  399. self.vision_config = Gemma3nVisionConfig(**self.vision_config)
  400. elif self.vision_config is None:
  401. self.vision_config = Gemma3nVisionConfig()
  402. logger.info("vision_config is None, using default Gemma3nVisionConfig vision config.")
  403. if isinstance(self.audio_config, dict):
  404. self.audio_config = Gemma3nAudioConfig(**self.audio_config)
  405. elif self.audio_config is None:
  406. self.audio_config = Gemma3nAudioConfig()
  407. logger.info("audio_config is None. Using default Gemma3nAudioConfig.")
  408. super().__post_init__(**kwargs)
# Names exported by `from ... import *`: the four Gemma3n configuration classes defined in this module.
__all__ = ["Gemma3nAudioConfig", "Gemma3nConfig", "Gemma3nTextConfig", "Gemma3nVisionConfig"]