Frozen OpenCLIP
CLIP model
FrozenOpenCLIPEmbedder_config
FrozenOpenCLIPEmbedder_config (arch:str, version:str, device:str, max_length:int, freeze:bool, layer:str)
FrozenOpenCLIPEmbedder
FrozenOpenCLIPEmbedder (arch='ViT-H-14', version='laion2b_s32b_b79k', device='cpu', max_length=77, freeze=True, layer='penultimate')
Loads and freezes the OpenCLIP transformer encoder for text prompts.
a = FrozenOpenCLIPEmbedder()
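Since freeze=True by default, the wrapped transformer should hold no trainable parameters. A quick sanity check, assuming freezing is done the usual PyTorch way (requires_grad switched off on every parameter):
# With freeze=True the wrapped open_clip model should expose no trainable
# parameters (assumes freezing via requires_grad=False, as is standard).
assert all(not param.requires_grad for param in a.model.parameters())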
="[1, 2, 2]"
p a.tokenize_and_push_to_device(p)
tensor([[49406, 314, 272, 267, 273, 267, 273, 316, 49407, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
"").shape a.tokenize_and_push_to_device(
torch.Size([1, 77])
"1,1,2", "2,2,2"]).shape a.tokenize_and_push_to_device([
torch.Size([2, 77])
a.model.attn_mask.shape
torch.Size([77, 77])
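The [77, 77] mask is presumably the standard causal attention mask that open_clip builds for its text transformer; a minimal sketch of how such a mask can be constructed (not taken from this repository):
import torch

# Causal mask over the 77-token context: -inf strictly above the diagonal,
# 0 on and below it, so each position can only attend to earlier positions.
ctx_len = 77
causal_mask = torch.full((ctx_len, ctx_len), float("-inf"))
causal_mask.triu_(1)  # in-place: keep -inf above the diagonal, zero the rest
causal_mask.shape     # torch.Size([77, 77])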
c = a.tokenize_and_push_to_device(["1,1,2", "2,2,2"])
enc = a(c)
enc.shape, enc
(torch.Size([2, 77, 1024]),
tensor([[[-0.3134, -0.4476, -0.0082, ..., 0.2542, -0.0324, -0.2960],
[ 0.0668, -1.2381, 0.9908, ..., 0.1785, 0.1592, -0.4320],
[ 0.6988, -0.2168, -1.2912, ..., 2.1063, -0.0302, -0.5666],
...,
[ 0.4703, -1.4072, -0.4847, ..., -0.1257, -0.1650, 0.1206],
[ 0.5117, -1.3949, -0.4672, ..., -0.4288, -0.2166, 0.2904],
[ 0.1480, -2.1998, -1.1187, ..., 0.0823, -0.4157, 0.6237]],
[[-0.3134, -0.4476, -0.0082, ..., 0.2542, -0.0324, -0.2960],
[-0.1180, -1.6322, 1.2987, ..., -0.1378, -0.1529, -0.3377],
[-0.7251, -0.8167, -0.9966, ..., 2.2262, -0.2325, -0.0138],
...,
[ 0.3887, -1.3395, -0.5868, ..., -0.1621, -0.0594, 0.1253],
[ 0.4360, -1.3350, -0.5684, ..., -0.4643, -0.1131, 0.2847],
[ 0.1691, -2.1725, -1.1441, ..., 0.0633, -0.3175, 0.7041]]]))
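Note that position 0 is identical for both prompts: under the causal mask shown above, the <start_of_text> token can only attend to itself, so its embedding does not depend on the rest of the prompt. The printed values already show this; a quick check:
# Position 0 is the <start_of_text> embedding and is prompt-independent
# under causal attention, so the two rows coincide.
torch.allclose(enc[0, 0], enc[1, 0])  # True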
c
tensor([[49406, 272, 267, 272, 267, 273, 49407, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[49406, 273, 267, 273, 267, 273, 49407, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
a.tokenizer.decode(c[1].tolist())
'<start_of_text>2 , 2 , 2 <end_of_text>!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
open_clip.decode(c[1])
'<start_of_text>2 , 2 , 2 <end_of_text>!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
Cached model
The model now also accepts (batched) scalar integer values that are mapped to unique conditions, e.g. \([1,2,2]=4\). If the input is such an integer, the output is the cached pre-embedded tensor; if something else is passed, such as a tokenized prompt, the normal embedding is computed live.
CachedFrozenOpenCLIPEmbedder
CachedFrozenOpenCLIPEmbedder (arch='ViT-H-14', version='laion2b_s32b_b79k', device='cpu', max_length=77, freeze=True, layer='penultimate')
Adds caching support to FrozenOpenCLIPEmbedder.
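A minimal sketch of the dispatch described above, assuming generate_cache stores the pre-computed embeddings in a single tensor (cached_embeddings is a hypothetical name) and that cached conditions arrive as a 1-D tensor of integer indices while live prompts arrive as a 2-D batch of token ids:
# Sketch only -- the attribute name `cached_embeddings` and the dispatch
# criterion (1-D integer tensor vs. 2-D token batch) are assumptions.
def forward(self, c):
    if torch.is_tensor(c) and c.ndim == 1 and not c.is_floating_point():
        # 1-D tensor of condition indices -> look up pre-embedded rows,
        # shape (batch, 77, 1024), no transformer pass needed
        return self.cached_embeddings[c]
    # otherwise: tokenized prompts of shape (batch, 77) -> embed live
    return super().forward(c)
Looking up rows of a pre-computed tensor avoids re-running the text transformer for conditions that repeat in every batch.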
a = CachedFrozenOpenCLIPEmbedder()
p = ["1,1,2", "2,2,2"]
a.generate_cache(p)
[INFO]: caching trying to allocate memory (2, 77, 1024) on cpu, approx. 0.001 GB
c_cached = torch.tensor([0,0,1], device=a.device)
c_uncached = a.tokenize_and_push_to_device(["1,1,2", "1,1,2", "2,2,2"])
enc_cached = a(c_cached)
enc_uncached = a(c_uncached)
enc_cached.shape, enc_uncached.shape, torch.allclose(enc_cached, enc_uncached, atol=1e-5)
(torch.Size([3, 77, 1024]), torch.Size([3, 77, 1024]), True)