how.wtf

Storing large items in DynamoDB

· Thomas Taylor

Each item in a DynamoDB table has a maximum size limit of 400 KB, including both the attribute names and values. This limit applies to all data types: strings, numbers, and binary data.

The three best ways to mitigate the maximum size limit are:

  1. Partitioning the data
  2. Compressing the data
  3. Storing data in S3
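
Before choosing a strategy, it helps to estimate an item's size up front. The helper below is a rough sketch (a hypothetical function, not part of the AWS SDK) that sums the UTF-8 byte lengths of attribute names and string/binary values; DynamoDB's exact accounting for numbers and nested types differs:

```python
def estimate_item_size(item):
    """Rough item size: UTF-8 bytes of attribute names plus values."""
    size = 0
    for name, value in item.items():
        size += len(name.encode("utf-8"))
        if isinstance(value, (bytes, bytearray)):
            size += len(value)
        else:
            size += len(str(value).encode("utf-8"))
    return size


MAX_ITEM_BYTES = 400 * 1024  # DynamoDB's 400 KB item limit

small = {"pk": "lorem", "data": "short"}
large = {"pk": "lorem", "data": "x" * 500_000}

print(estimate_item_size(small) <= MAX_ITEM_BYTES)  # prints True
print(estimate_item_size(large) <= MAX_ITEM_BYTES)  # prints False
```

If an estimate comes in anywhere near the limit, reach for one of the techniques below.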

How to handle large data in DynamoDB

Partition the data

A simple way to get around the item size limit is to split the data into multiple items.

Table Name: lorem

pk      sk     data
lorem   p#0    Lorem ipsum dolor sit…
lorem   p#1    Euismod nisi porta lo…
lorem   p#2    rcu risus quis varius…
lorem   p#3    phasellus. Enim praes…
import boto3


def partition_data(data, size):
    return [data[i : i + size] for i in range(0, len(data), size)]


# 100 paragraphs of Lorem ipsum
lorem = """
Lorem ipsum dolor sit amet, consectetur adipiscing elit,
sed do eiusmod tempor incididunt ut labore et dolore magna 
aliqua. Sem integer vitae justo eget magna. At tellus at 
urna condimentum mattis pellentesque id. Habitasse...
"""

dynamodb = boto3.resource("dynamodb")
table = dynamodb.Table("lorem")
partition_key = "lorem"
sort_key_prefix = "p#"  # p for partition

# Write chunks to DynamoDB
chunks = partition_data(lorem, 5000)
for i, c in enumerate(chunks):
    table.put_item(
        Item={
            "pk": partition_key,
            "sk": f"{sort_key_prefix}{i}",
            "data": c,
        }
    )

# Read chunks from DynamoDB
query_kwargs = {
    "KeyConditionExpression": "pk = :pk and begins_with(sk, :sk)",
    "ExpressionAttributeValues": {
        ":pk": partition_key,
        ":sk": sort_key_prefix,
    },
    "ScanIndexForward": True,
}
response = table.query(**query_kwargs)

# Query for all paginated results if applicable.
items = response["Items"]
while "LastEvaluatedKey" in response:
    response = table.query(
        ExclusiveStartKey=response["LastEvaluatedKey"], **query_kwargs
    )
    items.extend(response["Items"])

# Concatenate the data field from all the items
lorem_from_dynamodb = "".join(i["data"] for i in items)

print(lorem == lorem_from_dynamodb)  # prints True

Compress the data

Try reducing the size of your data with compression. Compression algorithms like Gzip can significantly shrink text-heavy payloads.
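
The savings are easy to verify locally with Python's built-in zlib (the same DEFLATE algorithm Gzip uses); the repetitive sample text below is illustrative:

```python
import zlib

# Highly repetitive text compresses very well
text = ("Lorem ipsum dolor sit amet, consectetur adipiscing elit. " * 1000).encode()
compressed = zlib.compress(text)

print(len(compressed) < len(text))              # prints True
print(zlib.decompress(compressed) == text)      # prints True
```

Real-world compression ratios depend on how repetitive the data is, so measure with your own payloads before committing to this approach.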

Compressing a single item

Table Name: lorem

pk      data
lorem   eJy1Xdly3LquffdX6ANu+…
lorem   eJytXFl227gS/c8quALtw…
import boto3
import zlib


def compress_data(data):
    return zlib.compress(data.encode())


# 100 paragraphs of Lorem ipsum
lorem = """
Lorem ipsum dolor sit amet, consectetur adipiscing elit,
sed do eiusmod tempor incididunt ut labore et dolore magna 
aliqua. Sem integer vitae justo eget magna. At tellus at 
urna condimentum mattis pellentesque id. Habitasse...
"""

dynamodb = boto3.resource("dynamodb")
table = dynamodb.Table("lorem")
partition_key = "lorem"

table.put_item(Item={"pk": partition_key, "data": compress_data(lorem)})

response = table.get_item(Key={"pk": partition_key})
data = response["Item"]["data"]

lorem_from_dynamodb = zlib.decompress(bytes(data)).decode()
print(lorem_from_dynamodb == lorem)  # prints True

Compressing a partitioned item

DynamoDB schema:

Table Name: lorem

pk      sk     data
lorem   p#0    eJy1Xdly3LquffdX6ANu+…
lorem   p#1    eJytXFl227gS/c8quALtw…
import boto3
import zlib


def partition_data(data, size):
    return [data[i : i + size] for i in range(0, len(data), size)]


def compress_data(data):
    return zlib.compress(data.encode())


# 100 paragraphs of Lorem ipsum
lorem = """
Lorem ipsum dolor sit amet, consectetur adipiscing elit,
sed do eiusmod tempor incididunt ut labore et dolore magna 
aliqua. Sem integer vitae justo eget magna. At tellus at 
urna condimentum mattis pellentesque id. Habitasse...
"""

dynamodb = boto3.resource("dynamodb")
table = dynamodb.Table("lorem")
partition_key = "lorem"
sort_key_prefix = "p#"  # p for partition

# Write compressed chunks to DynamoDB
chunks = partition_data(lorem, 50000)
for i, c in enumerate(chunks):
    table.put_item(
        Item={
            "pk": partition_key,
            "sk": f"{sort_key_prefix}{i}",
            "data": compress_data(c),
        }
    )

# Read chunks from DynamoDB
query_kwargs = {
    "KeyConditionExpression": "pk = :pk and begins_with(sk, :sk)",
    "ExpressionAttributeValues": {":pk": partition_key, ":sk": sort_key_prefix},
    "ScanIndexForward": True,
}
response = table.query(**query_kwargs)

# Query for all paginated results if applicable.
items = response["Items"]
while "LastEvaluatedKey" in response:
    response = table.query(
        ExclusiveStartKey=response["LastEvaluatedKey"], **query_kwargs
    )
    items.extend(response["Items"])

# Decompress and concatenate the data field from all the items
lorem_from_dynamodb = "".join(
    zlib.decompress(bytes(i["data"])).decode() for i in items
)

print(lorem_from_dynamodb == lorem)  # prints True

Store the data in S3

Consider storing the data in S3 as opposed to an attribute value in DynamoDB.

Table Name: lorem

pk      s3_key
lorem   s3://bucket/key
import boto3

# 100 paragraphs of Lorem ipsum
lorem = """
Lorem ipsum dolor sit amet, consectetur adipiscing elit,
sed do eiusmod tempor incididunt ut labore et dolore magna 
aliqua. Sem integer vitae justo eget magna. At tellus at 
urna condimentum mattis pellentesque id. Habitasse...
"""

bucket_name = "bucket_name"
object_key = "object_key"
partition_key = "lorem"
s3_key = f"s3://{bucket_name}/{object_key}"

# Store data in S3 object
s3 = boto3.client("s3")
s3.put_object(Bucket=bucket_name, Key=object_key, Body=lorem.encode())

# Store reference to S3 object in DynamoDB
dynamodb = boto3.resource("dynamodb")
table = dynamodb.Table("lorem")
table.put_item(Item={"pk": partition_key, "s3_key": s3_key})

# Get reference to S3 object from DynamoDB
response = table.get_item(Key={"pk": partition_key})
s3_key = response["Item"]["s3_key"]

# Read contents of S3 object
bucket, key = s3_key[5:].split("/", 1)  # strip "s3://", then split bucket from key
response = s3.get_object(Bucket=bucket, Key=key)
lorem_from_s3 = response["Body"].read().decode()

print(lorem_from_s3 == lorem)  # prints True

#python   #aws   #dynamodb   #serverless  
