class PackedLoRALayerWeights(LoRALayerWeights):
"""LoRA used for packed layers (eg. qkv_proj)."""
def __init__(
self,
module_name: str,
rank: int,
lora_alphas: list[int | None],
lora_a: list[torch.Tensor | None],
lora_b: list[torch.Tensor | None],
scaling: list[float] | None = None,
) -> None:
super().__init__(
module_name=module_name,
rank=rank,
lora_alpha=0,
lora_a=lora_a,
lora_b=lora_b,
scaling=scaling, # type: ignore
)
self.lora_alphas = lora_alphas
        if scaling is None:
            # Sub-modules without a LoRA (alpha is None) keep a None scaling.
            self.scaling = [  # type: ignore
                lora_alpha / self.rank  # type: ignore # noqa
                if lora_alpha is not None
                else None
                for lora_alpha in self.lora_alphas
            ]

    @classmethod
def pack(
cls, loras: GenericSequence["LoRALayerWeights | None"]
) -> "PackedLoRALayerWeights":
"""Pack a list of LoRAs into a single LoRA.
If LoRA is None, it signifies that the submodule does not have a LoRA.
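
        Example (illustrative; ``q_lora``, ``k_lora`` and ``v_lora`` are
        hypothetical per-sub-module LoRALayerWeights):

            packed = PackedLoRALayerWeights.pack([q_lora, k_lora, v_lora])
            # A sub-module without a LoRA is passed as None:
            packed = PackedLoRALayerWeights.pack([q_lora, None, v_lora])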
"""
first_lora = next(lora for lora in loras if lora is not None)
for lora in loras:
if lora is None:
continue
lora.optimize()
rank = first_lora.rank
module_name = first_lora.module_name
obj = cls(
module_name,
rank,
[lora.lora_alpha if lora is not None else None for lora in loras],
[lora.lora_a if lora is not None else None for lora in loras],
[lora.lora_b if lora is not None else None for lora in loras],
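            # optimize() above already folded each sub-LoRA's scaling factor
            # into its lora_b, so packed slices use a scaling of 1.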
scaling=[
1 if lora is not None else None # type: ignore
for lora in loras
],
)
        return obj

    @classmethod
def pack_moe(
cls,
loras: GenericSequence["LoRALayerWeights | None"],
module_name: str,
is_non_gated_moe: bool = False,
) -> "PackedLoRALayerWeights":
"""Pack a list of LoRAs into a single LoRA.
If LoRA is None, it signifies that the submodule does not have a LoRA.
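
        ``loras`` is expected to be flattened per expert in (w1, w2, w3)
        order, i.e. ``[w1_e0, w2_e0, w3_e0, w1_e1, w2_e1, w3_e1, ...]``, so
        its length must be a multiple of 3.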
"""
first_lora = next(lora for lora in loras if lora is not None)
assert first_lora is not None
rank = first_lora.rank
lora_alpha = first_lora.lora_alpha
assert len(loras) % 3 == 0
w1_lora_a_lst = []
w2_lora_a_lst = []
w3_lora_a_lst = []
w1_lora_b_lst = []
w2_lora_b_lst = []
w3_lora_b_lst = []
# TODO: Consider the case where some experts don't have LoRA added.
for eid in range(len(loras) // 3):
w1_lora = loras[eid * 3]
w2_lora = loras[eid * 3 + 1]
w3_lora = loras[eid * 3 + 2]
            # For non-gated MoE, w3 is not used, so we reuse w1's LoRA
            # weights. This case is detected via the expert mapping
            # (get_expert_mapping), where ckpt_up_proj_name is empty.
if w3_lora is None and is_non_gated_moe:
w3_lora = w1_lora
assert w1_lora is not None
assert w2_lora is not None
assert w3_lora is not None
w1_lora_a_lst.append(w1_lora.lora_a)
w2_lora_a_lst.append(w2_lora.lora_a)
w3_lora_a_lst.append(w3_lora.lora_a)
w1_lora_b_lst.append(w1_lora.lora_b)
w2_lora_b_lst.append(w2_lora.lora_b)
w3_lora_b_lst.append(w3_lora.lora_b)
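
        # Stack the per-expert matrices into batched tensors: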
        # lora_a -> (num_experts, rank, input_size)
        # lora_b -> (num_experts, output_size, rank)
        w1_lora_a = torch.stack(w1_lora_a_lst, dim=0)
        w2_lora_a = torch.stack(w2_lora_a_lst, dim=0)
        w1_lora_b = torch.stack(w1_lora_b_lst, dim=0)
        w2_lora_b = torch.stack(w2_lora_b_lst, dim=0)
# All w1, w2, w3 have the same scaling factor.
scaling = lora_alpha / rank
last_scaling = scaling
        if is_non_gated_moe:
            # For non-gated MoE, reuse w1's tensors for w3 to avoid wasting
            # memory; w3_lora_a_lst and w3_lora_b_lst are not relevant here.
            w3_lora_a = w1_lora_a
            w3_lora_b = w1_lora_b
            # Because w3 aliases w1's tensors, set w3's scaling to 1 to avoid
            # double-scaling (optimize() multiplies lora_b in place).
            last_scaling = 1.0
else:
w3_lora_a = torch.stack(w3_lora_a_lst, dim=0)
w3_lora_b = torch.stack(w3_lora_b_lst, dim=0)
obj = cls(
module_name,
rank,
[lora_alpha, lora_alpha, lora_alpha],
[w1_lora_a, w2_lora_a, w3_lora_a],
[w1_lora_b, w2_lora_b, w3_lora_b],
scaling=[scaling, scaling, last_scaling],
)
        return obj

    def optimize(self) -> "PackedLoRALayerWeights":
"""Optimize the LoRA by merging the scaling into lora_b."""
for i in range(len(self.lora_b)):
if self.scaling[i] == 1 or self.lora_b[i] is None: # type: ignore
continue
self.lora_b[i] *= self.scaling[i] # type: ignore
self.scaling[i] = 1 # type: ignore
        return self

    @property
def input_dim(self) -> int:
        raise NotImplementedError()

    @property
def output_dim(self) -> int:
        raise NotImplementedError()

    @property
def is_packed(self) -> bool:
return True