diff --git a/.gitignore b/.gitignore
index fe90f73..978379e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,4 +14,5 @@ tsconfig.tsbuildinfo
.quartz-cache
# Custom
-TODO.md
\ No newline at end of file
+TODO.md
+copy.sh
diff --git a/content/attachments/Pasted image 20240325030634.png b/content/attachments/Pasted image 20240325030634.png
new file mode 100644
index 0000000..af8b7e7
Binary files /dev/null and b/content/attachments/Pasted image 20240325030634.png differ
diff --git a/content/attachments/Pasted image 20240325033014.png b/content/attachments/Pasted image 20240325033014.png
new file mode 100644
index 0000000..b16fb24
Binary files /dev/null and b/content/attachments/Pasted image 20240325033014.png differ
diff --git a/content/attachments/Pasted image 20240325175452.png b/content/attachments/Pasted image 20240325175452.png
new file mode 100644
index 0000000..1f1c2ee
Binary files /dev/null and b/content/attachments/Pasted image 20240325175452.png differ
diff --git a/content/attachments/Pasted image 20240325181518.png b/content/attachments/Pasted image 20240325181518.png
new file mode 100644
index 0000000..2c8db7e
Binary files /dev/null and b/content/attachments/Pasted image 20240325181518.png differ
diff --git a/content/attachments/Pasted image 20240523233204.png b/content/attachments/Pasted image 20240523233204.png
new file mode 100644
index 0000000..d458ded
Binary files /dev/null and b/content/attachments/Pasted image 20240523233204.png differ
diff --git a/content/attachments/Pasted image 20240524001339.png b/content/attachments/Pasted image 20240524001339.png
new file mode 100644
index 0000000..91f5266
Binary files /dev/null and b/content/attachments/Pasted image 20240524001339.png differ
diff --git a/content/attachments/Pasted image 20240525004430.png b/content/attachments/Pasted image 20240525004430.png
new file mode 100644
index 0000000..e28aa1c
Binary files /dev/null and b/content/attachments/Pasted image 20240525004430.png differ
diff --git a/content/attachments/Pasted image 20240605021720.png b/content/attachments/Pasted image 20240605021720.png
new file mode 100644
index 0000000..faca4a3
Binary files /dev/null and b/content/attachments/Pasted image 20240605021720.png differ
diff --git a/content/attachments/Pasted image 20240607164228.png b/content/attachments/Pasted image 20240607164228.png
new file mode 100644
index 0000000..58ffedb
Binary files /dev/null and b/content/attachments/Pasted image 20240607164228.png differ
diff --git a/content/attachments/Pasted image 20240608170843.png b/content/attachments/Pasted image 20240608170843.png
new file mode 100644
index 0000000..b394d85
Binary files /dev/null and b/content/attachments/Pasted image 20240608170843.png differ
diff --git a/content/attachments/Pasted image 20240611145858.png b/content/attachments/Pasted image 20240611145858.png
new file mode 100644
index 0000000..415a6b9
Binary files /dev/null and b/content/attachments/Pasted image 20240611145858.png differ
diff --git a/content/attachments/Pasted image 20240611155836.png b/content/attachments/Pasted image 20240611155836.png
new file mode 100644
index 0000000..fb4e4bd
Binary files /dev/null and b/content/attachments/Pasted image 20240611155836.png differ
diff --git a/content/attachments/Pasted image 20240611161319.png b/content/attachments/Pasted image 20240611161319.png
new file mode 100644
index 0000000..536192b
Binary files /dev/null and b/content/attachments/Pasted image 20240611161319.png differ
diff --git a/content/attachments/Pasted image 20240611162107.png b/content/attachments/Pasted image 20240611162107.png
new file mode 100644
index 0000000..aacd40d
Binary files /dev/null and b/content/attachments/Pasted image 20240611162107.png differ
diff --git a/content/attachments/Pasted image 20240623185831.png b/content/attachments/Pasted image 20240623185831.png
new file mode 100644
index 0000000..f2db4f3
Binary files /dev/null and b/content/attachments/Pasted image 20240623185831.png differ
diff --git a/content/attachments/Pasted image 20240623185850.png b/content/attachments/Pasted image 20240623185850.png
new file mode 100644
index 0000000..55684bf
Binary files /dev/null and b/content/attachments/Pasted image 20240623185850.png differ
diff --git a/content/attachments/Pasted image 20240623193335.png b/content/attachments/Pasted image 20240623193335.png
new file mode 100644
index 0000000..b7ac957
Binary files /dev/null and b/content/attachments/Pasted image 20240623193335.png differ
diff --git a/content/attachments/Pasted image 20240727182314.png b/content/attachments/Pasted image 20240727182314.png
new file mode 100644
index 0000000..94f03f7
Binary files /dev/null and b/content/attachments/Pasted image 20240727182314.png differ
diff --git a/content/attachments/Pasted image 20240727183013.png b/content/attachments/Pasted image 20240727183013.png
new file mode 100644
index 0000000..753580f
Binary files /dev/null and b/content/attachments/Pasted image 20240727183013.png differ
diff --git a/content/attachments/Pasted image 20240728181144.png b/content/attachments/Pasted image 20240728181144.png
new file mode 100644
index 0000000..a5a8cde
Binary files /dev/null and b/content/attachments/Pasted image 20240728181144.png differ
diff --git a/content/attachments/Pasted image 20240728181353.png b/content/attachments/Pasted image 20240728181353.png
new file mode 100644
index 0000000..be9141e
Binary files /dev/null and b/content/attachments/Pasted image 20240728181353.png differ
diff --git a/content/attachments/Pasted image 20240728181717.png b/content/attachments/Pasted image 20240728181717.png
new file mode 100644
index 0000000..9115ae0
Binary files /dev/null and b/content/attachments/Pasted image 20240728181717.png differ
diff --git a/content/attachments/Pasted image 20240728182123.png b/content/attachments/Pasted image 20240728182123.png
new file mode 100644
index 0000000..0493ef4
Binary files /dev/null and b/content/attachments/Pasted image 20240728182123.png differ
diff --git a/content/attachments/Pasted image 20240728182851.png b/content/attachments/Pasted image 20240728182851.png
new file mode 100644
index 0000000..7154b81
Binary files /dev/null and b/content/attachments/Pasted image 20240728182851.png differ
diff --git a/content/notes/Building Koi Pond - Simulating Millions of Slack Clients (2023).md b/content/notes/Building Koi Pond - Simulating Millions of Slack Clients (2023).md
new file mode 100644
index 0000000..edb4b4f
--- /dev/null
+++ b/content/notes/Building Koi Pond - Simulating Millions of Slack Clients (2023).md
@@ -0,0 +1,97 @@
+---
+tags:
+ - talks
+created: 2024-06-07
+source: https://www.youtube.com/watch?v=oOCubxI3wmI
+origin: Maude Lemaire
+rating: 3.5
+publish: true
+---
+Maude Lemaire is a Sr. Staff Software Engineer @ Slack and technical lead for the backend performance infrastructure team.
+
+Slack didn't have any load testing tooling. They had a big customer that was putting strain on their infra.
+
+The initial tool (API Blast) just made API requests to the server. It had a few parameters for specifying concurrency, rate limits etc. but that's it.
+- Why did they not use an off-the-shelf tool that already provides these capabilities?
+
+API Blast didn't test the web-socket stack. The only thing tested was ingest. Message propagation wasn't exercised since there were no clients connected to receive the messages.
+
+![[Pasted image 20240608170843.png]]
+
+- Edge API serves data that is cached in various Points-of-Presence around the world & needed instantaneously. (Like the quick switcher / search bar that comes up on ⌘ + K)
+- Real-time services maintain all active web-socket connections with all users around the world. They organize those connections by channel.
+
+This worked for a while, but a newer customer wanted a channel housing all 300K of its users, where 100K users were likely to be active at the same time.
+
+Slack is susceptible to load-related performance problems in 3 key ways:
+- Massive fan-out (users sending messages in big channels)
+- Event floods
+- Thundering herd
+ - Eg: If someone posts a message that goes to 100K active users and even 10 users send 1 reaction each, those reactions need to be propagated to 100K clients, which leads to 1 million web-socket events.
+
+In mid-2019, they built a tool called Puppet Show.
+- Simulated real desktop users by spinning up 1000s of headless Chrome browsers logged into Slack (distinct users, distinct tokens) across a K8s cluster.
+- They had a script simulate different actions like switching to a channel & posting a message, switching to another & adding a reaction, trolling Slackbot etc.
+- A central component (Puppeteer) oversaw all the puppets. The puppets would check in regularly to update the puppeteer about their state. Puppets would receive a script from Puppeteer and start executing it.
+ - Sidenote: Don't confuse Puppeteer with the Node.js library used to control Chromium.
+- Pros
+ - High fidelity. Nothing better than logging into the Slack client and executing actions.
+ - Flexible scripting using JavaScript
+- Cons
+ - Costs a lot. Each puppet instance cost 37 cents per day, so running 100K instances would cost 37K USD every day.
+ - Spinning up 100K instances took several days and pods would crash frequently.
+- Once it was verified that Slack could handle the load, they stopped using this tool.
+
+They signed up a customer (in 2020, around the pandemic) that wanted support for 500K (IBM probably) users in the same Slack instance.
+
+The headless Chrome browsers were replaced w/ lightweight client simulators written in Go.
+- A koi is a Slack client simulation (single Go routine)
+- A school is a collection of koi (single Go program that runs in a single pod on K8s)
+- The keeper manages the schools and keeps track of the overall load test state and parameters.
+
+A JSON configuration file provided when you boot up a load test tells a "koi" what to do once booted.
+Each action is mapped to a probability of being performed, and each action also has a sequence of implementation steps that the koi executes whenever it performs that action.
+```
+{
+ "behaviors": {
+ "chat.postMessage": {
+ "frequency": 0.043
+ }
+ },
+ "sequences": {
+ "chat.postMessage": {
+ "doc": "Sends a message to a random channel.",
+ "steps": [
+ ...
+ ]
+ }
+ }
+}
+```
+
+Every tick (configurable but 1s by default), a koi runs through its entire configuration and performs the actions based on their odds.
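+
+A minimal sketch of the tick loop in Go (my own illustration, not Slack's code; the `Behavior` and `Koi` types are assumptions modeled on the config above):
+```go
+package main
+
+import (
+    "fmt"
+    "math/rand"
+    "time"
+)
+
+// Behavior mirrors one entry of the "behaviors" block in the JSON config:
+// the probability that this action fires on a given tick. (Hypothetical type.)
+type Behavior struct {
+    Frequency float64
+}
+
+// Koi is a single simulated client; the real one is a goroutine inside a "school".
+type Koi struct {
+    behaviors map[string]Behavior
+    sequences map[string]func() // what to execute when an action fires
+}
+
+// tick rolls every configured behavior once and runs the sequences that hit.
+func (k *Koi) tick() {
+    for name, b := range k.behaviors {
+        if rand.Float64() < b.Frequency {
+            if seq, ok := k.sequences[name]; ok {
+                seq()
+            }
+        }
+    }
+}
+
+func (k *Koi) run(interval time.Duration, stop <-chan struct{}) {
+    t := time.NewTicker(interval) // 1s per tick by default, per the talk
+    defer t.Stop()
+    for {
+        select {
+        case <-t.C:
+            k.tick()
+        case <-stop:
+            return
+        }
+    }
+}
+
+func main() {
+    k := &Koi{
+        behaviors: map[string]Behavior{"chat.postMessage": {Frequency: 0.043}},
+        sequences: map[string]func(){"chat.postMessage": func() { fmt.Println("post to a random channel") }},
+    }
+    stop := make(chan struct{})
+    go k.run(time.Second, stop)
+    time.Sleep(5 * time.Second)
+    close(stop)
+}
+```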
+
+This was good enough to simulate massive fan-out and event floods but not thundering herds, since they couldn't simulate coordinated behavior. That's why they have "formations", which allow specifying the percentage of users participating in a sequence over a period of time.
+```
+{
+ "formations": [
+ {
+ "name": "Populate announcement channel with reactions",
+ "begin_within_secs": 30,
+ "percent": 1.0,
+ "sequence": {
+ "steps": [
+ ...
+ ]
+ }
+ }
+ ]
+}
+```
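+
+A rough sketch (inferred from the formation config above, not Slack's implementation) of how a formation could be driven: each simulated client joins with probability `percent` and starts its sequence at a random delay within `begin_within_secs`, which is what produces the coordinated burst:
+```go
+package main
+
+import (
+    "fmt"
+    "math/rand"
+    "sync"
+    "time"
+)
+
+// runFormation: each simulated client joins with probability percent and, if it
+// does, starts the sequence at a random delay within beginWithin. This is what
+// produces a coordinated burst of web-socket events (a thundering herd).
+func runFormation(clients int, percent float64, beginWithin time.Duration, sequence func(id int)) {
+    var wg sync.WaitGroup
+    for id := 0; id < clients; id++ {
+        if rand.Float64() >= percent {
+            continue // this client sits the formation out
+        }
+        wg.Add(1)
+        go func(id int) {
+            defer wg.Done()
+            time.Sleep(time.Duration(rand.Int63n(int64(beginWithin))))
+            sequence(id)
+        }(id)
+    }
+    wg.Wait()
+}
+
+func main() {
+    // e.g. every client reacts to the announcement message within 3 seconds
+    runFormation(100, 1.0, 3*time.Second, func(id int) {
+        fmt.Printf("client %d adds a reaction\n", id)
+    })
+}
+```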
+
+A koi cost 0.1 cents to run per day.
+## Appendix
+- Slides: https://speakerdeck.com/qcmaude/building-koi-pond-simulating-millions-of-slack-clients
+- https://slack.engineering/%EF%B8%8F-surfs-up-preparing-for-huge-waves-of-traffic-via-load-testing/
+- https://slack.engineering/load-testing-with-koi-pond/
+- https://slack.engineering/continuous-load-testing/
diff --git a/content/notes/CMU Intro to Database Systems (2022).md b/content/notes/CMU Intro to Database Systems (2022).md
index ad005bf..83a6241 100644
--- a/content/notes/CMU Intro to Database Systems (2022).md
+++ b/content/notes/CMU Intro to Database Systems (2022).md
@@ -7,20 +7,21 @@ source: https://15445.courses.cs.cmu.edu/fall2022/
origin: Andy Pavlo
publish: true
---
+Lectures: https://15445.courses.cs.cmu.edu/fall2022/schedule.html
+Assignments: https://15445.courses.cs.cmu.edu/spring2023/assignments.html
+
## 1 : Relational Model & Relational Algebra
Inventor of Relational Model : Edgar Frank "Ted" Codd at IBM
-A Relational Model of Data for Large Shared Data Banks
-Derivability, Redundancy and Consistency of Relations stored in Large Data Banks
+- [A Relational Model of Data for Large Shared Data Banks](https://www.seas.upenn.edu/~zives/03f/cis550/codd.pdf)
+- [Derivability, Redundancy and Consistency of Relations stored in Large Data Banks](https://technology.amis.nl/wp-content/uploads/images/RJ599.pdf)
- Relation / Table : Unordered set containing relationship of attributes that represent an entity.
- Tuple / Row / Record : Set of attribute values (domain) in the relation.
- Primary key identifies a single tuple.
- Data Manipulation Languages (DML)
- - Procedural
- - Relational Algebra
- - Declarative (like SQL)
- - Relational Calculus
+ - Procedural : Relational Algebra
+ - Declarative (like SQL) : Relational Calculus
Relational Algebra
- Based on Sets
@@ -31,89 +32,84 @@ Fun Fact: SQL was Sequel (Structured English Query Language) initially but IBM h
## 2 : Modern SQL
- SQL is a combination of
- - Data Manipulation Language (DML)
- - retrieve or modify data
- - Data Definition Language (DDL)
- - specify what data looks like, indexes, namespaces, triggers, functions
- - Data Control Language (DCL)
- - access control
-- SQL is not based on sets (no duplicates) but bags (can have duplicates).
-- Nested Queries: ALL, ANY, IN, EXISTS
-- Window Functions: "Sliding" calculations
-- You can do recursion in CTEs
+ - Data Manipulation Language (DML) : retrieve or modify data
+ - Data Definition Language (DDL) : specify what data looks like, indexes, namespaces, triggers, functions
+ - Data Control Language (DCL) : access control
+- SQL is not based on sets (no duplicates) but on bags (can have duplicates).
## 3 : Database Storage 1
- Sequential access can be faster than random access
- Case in point: Think about getting 1MB of data that is contiguously laid out
- Sometimes algorithms might seem slow on paper but we want to optimise for sequential access in DBMS.
-- TODO: Put the other memory levels here
- "Memory" in this course will refer to DRAM.
+
- Disk Oriented DBMS
- Disk: Blocks/Pages
- Memory: Buffer Pool / Page Cache / Buffer Manager
- We don't want to rely on the OS to do anything apart from giving the DBMS memory and not letting the process get killed.
+
+### mmap
- mmap can store contents of a file into the address space of a program but it has some issues:
- - Issues:
- - Transaction Safety : OS can flush dirty pages at any time (you might not want to save changes yet if you're updating multiple pages but the OS will flush the pages and lead to corrupt data)
- - I/O Stalls : DBMS doesn't know which pages are in memory so it'll get a page fault every time it tries to access a page not in memory & be stalled by the OS then.
- - Error Handling : Difficult to validate pages. (DBMS can maintain checksums if it is controlling read, writes unlike mmap)
- - Performance Issues : OS has it's own data structure for page table which can become a bottleneck.
- - mmap might be okay for really light workloads on small DBs. It's quick and easy to start with.
- - mmap is used by elasticsearch, leveldb
- - mmap was partially used by mongodb earlier but they stopped using it since they faced problems
- - sqlite also partially uses mmap
- - For more info on why you shouldn't use mmaps for a DBMS: Read https://db.cs.cmu.edu/mmap-cidr2022/
-- Problem 1 : How the DBMS represents the database in files on disk
- - File Storage
- - A file is a collection of pages.
- - Page is a fixed-size block of data.
- - Page can also have metadata about itself.
- - Different notions of "pages" in a DBMS
- - Hardware : usually 4KB
- - OS : usually 4KB
- - Database : 512B - 16KB
- - A hardware page is the largest block of data that the storage device can guarantee failsafe writes.
- - We want to keep OS pages larger to make less syscalls for read/writes. We also don't want to enlarge it too much since then we'll be fetching unnecessary data.
- - Page Storage Architecture
- - Heap
- - unordered collection of pages
- - support create, get, write, delete, iteration over pages
- - page directory are special pages which track location of data pages in DB files
- - a data pages being self-contained (having metadata about it's location, table etc.) can help in data recovery if something happens to page directory
- - Tree
- - Sequential / Sorted
- - Hashing
- - Some reasons for storing things in small files
- - easier to obtain a lock on specific data
- - reduces the blast radius if a single file gets corrupted or deleted
- - we can symlink particular directories on a faster disk for faster access
- - Page Layout
- - Log Oriented
- - Tuple Oriented
- - Strawman Idea: Keep track of number of tuples in a page and then just append a new tuple to end.
- - Issues
- - Deleting a tuple
- - If a tuple is deleted from between, there's no way to know that free space is available there for putting a tuple in
- - Variable-length attribute
- - Allocating empty space is wasteful
- - Slotted Pages
- - Slot Array in the beginning of the page maps to the starting positions of the tuples. (Also contains tuple size)
- - Slot Array grows from beginning to end. Tuple data grows from end to beginning.
- - We consider the page full when we can't enter any more slots or tuples.
- - Header keeps track of number of used slots.
- - You can readjust tuple locations if you want to on deletion to reduce fragmentation in a particular page.
- - You can use other compaction processes to reduce the amount of space required.
- - Record Ids : DBMS needs a way to keep track of individual tuples. Each tuple has unique record id.
- - simple method: page_id + offset/slot
- - can also contain file location info
- - also: see ctid in postgres (you can query it)
- - VACCUM in pg would does the re-adjustment of deleted rows
- - sqlite has rowid
- - pg's auto-vaccum keeps tracks of pages changes since last Vaccum & then compacts only those pages
- - pages can also be merged together if they're half-full
- - Tuple Layout
- - Tuples also have a header which can store visibility info & bit map for null values in the data attributes.
- - For variable length data, you can also have pointers to some other location stored in the tuple.
+- Issues:
+ - Transaction Safety : OS can flush dirty pages at any time (you might not want to save changes yet if you're updating multiple pages, but the OS may flush them anyway and leave corrupt data on disk)
+ - I/O Stalls : DBMS doesn't know which pages are in memory, so it'll get a page fault every time it tries to access a page not in memory & will then be stalled by the OS.
+ - Error Handling : Difficult to validate pages. (DBMS can maintain checksums if it controls reads & writes, unlike with mmap)
+ - Performance Issues : OS has its own page-table data structure which can become a bottleneck.
+- mmap might be okay for really light workloads on small DBs. It's quick and easy to start with.
+- Sidenote: mmap is used by Elasticsearch and LevelDB; SQLite also partially uses mmap; MongoDB partially used mmap earlier but stopped after running into problems
+- For more info on why you shouldn't use mmap for a DBMS, read https://db.cs.cmu.edu/mmap-cidr2022/
+
+### How the DBMS represents the database in files on disk
+- File Storage
+ - A file is a collection of pages.
+ - Page is a fixed-size block of data.
+ - Page can also have metadata about itself.
+ - Different notions of "pages" in a DBMS
+ - Hardware : usually 4KB
+ - OS : usually 4KB
+ - Database : 512B - 16KB
+ - A hardware page is the largest block of data for which the storage device can guarantee failsafe writes.
+ - We want to keep OS pages larger to make fewer syscalls for reads/writes. We also don't want to enlarge them too much, since then we'll be fetching unnecessary data.
+- Page Storage Architecture
+ - Heap
+ - unordered collection of pages
+ - support create, get, write, delete, iteration over pages
+ - the page directory consists of special pages which track the location of data pages in DB files
+ - a data page being self-contained (having metadata about its location, table etc.) can help in data recovery if something happens to the page directory
+ - Tree
+ - Sequential / Sorted
+ - Hashing
+- Some reasons for storing things in small files
+ - easier to obtain a lock on specific data
+ - reduces the blast radius if a single file gets corrupted or deleted
+ - we can symlink particular directories onto a faster disk for faster access
+- Page Layout
+ - Log Oriented
+ - Tuple Oriented
+ - Strawman Idea: Keep track of number of tuples in a page and then just append a new tuple to end.
+ - Issues
+ - Deleting a tuple
+ - If a tuple is deleted from the middle of the page, there's no way to know that free space is available there for a new tuple
+ - Variable-length attribute
+ - Allocating empty space is wasteful
+ - Slotted Pages (see the sketch after this list)
+ - The slot array at the beginning of the page maps to the starting positions of the tuples. (It also contains tuple sizes.)
+ - Slot Array grows from beginning to end. Tuple data grows from end to beginning.
+ - We consider the page full when we can't enter any more slots or tuples.
+ - Header keeps track of number of used slots.
+ - You can readjust tuple locations if you want to on deletion to reduce fragmentation in a particular page.
+ - You can use other compaction processes to reduce the amount of space required.
+ - Record Ids : DBMS needs a way to keep track of individual tuples. Each tuple has unique record id.
+ - simple method: page_id + offset/slot
+ - can also contain file location info
+ - also: see ctid in postgres (you can query it)
+ - VACUUM in pg does the re-adjustment of deleted rows
+ - sqlite has rowid
+ - pg's auto-vacuum keeps track of page changes since the last VACUUM & then compacts only those pages
+ - pages can also be merged together if they're half-full
+- Tuple Layout
+ - Tuples also have a header which can store visibility info & bit map for null values in the data attributes.
+ - For variable length data, you can also have pointers to some other location stored in the tuple.
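+
+To make the slotted-page layout above concrete, here's a toy sketch in Go (page size, header size and field names are my own assumptions, not from the course). The slot array grows from the front of the page while tuple data grows from the back, and the page is full when the two regions would meet:
+```go
+package main
+
+import (
+    "errors"
+    "fmt"
+)
+
+const (
+    pageSize   = 4096 // DB page size (assumed for the sketch)
+    headerSize = 8    // e.g. slot count + free-space pointer
+    slotSize   = 4    // 2-byte offset + 2-byte length per slot
+)
+
+// slot records where a tuple starts inside the page and how long it is.
+type slot struct {
+    offset uint16
+    length uint16
+}
+
+type slottedPage struct {
+    data    [pageSize]byte
+    slots   []slot // grows forward; in a real page this lives right after the header
+    freeEnd int    // tuple data grows backward from here
+}
+
+func newSlottedPage() *slottedPage { return &slottedPage{freeEnd: pageSize} }
+
+// insert appends a slot entry at the front and copies the tuple at the back.
+func (p *slottedPage) insert(tuple []byte) (int, error) {
+    slotSpace := headerSize + (len(p.slots)+1)*slotSize
+    if p.freeEnd-len(tuple) < slotSpace {
+        return 0, errors.New("page full")
+    }
+    p.freeEnd -= len(tuple)
+    copy(p.data[p.freeEnd:], tuple)
+    p.slots = append(p.slots, slot{offset: uint16(p.freeEnd), length: uint16(len(tuple))})
+    return len(p.slots) - 1, nil // (page id, slot id) would form the record id
+}
+
+func (p *slottedPage) get(slotID int) []byte {
+    s := p.slots[slotID]
+    return p.data[int(s.offset) : int(s.offset)+int(s.length)]
+}
+
+func main() {
+    p := newSlottedPage()
+    id, _ := p.insert([]byte("hello tuple"))
+    fmt.Printf("slot %d -> %q\n", id, p.get(id))
+}
+```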
- Misc
- Postgres does MVCC the wrong way:
- To Read: https://db.cs.cmu.edu/papers/2017/p781-wu.pdf
@@ -133,58 +129,58 @@ Fun Fact: SQL was Sequel (Structured English Query Language) initially but IBM h
- Useless Disk I/O (page may have data we don't care about which we'll need to read since we can't get exact bytes)
- Random Disk I/O (eg: tuples that need to updated may reside on different pages)
- Note to Remember: Sequential I/O is great for non-volatile storage compared to random access.
-- Log Structured Storage
- - What if DBMS couldn't overwrite data in pages & could only create new pages?
- - Eg: AWS S3, HDFS
- - (Think of just a key-value store here, PUT contains the entire value that needs to be stored)
- - DBMS stores log records that contain changes to tuples (PUT, DELETE).
- - Each log record must contain the tuple's unique id.
- - Put records contain the tuple contents.
- - Deletes marks the tuple as deleted.
- - When the page gets full, the DBMS writes it from in-memory page out to disk.
- - Writes are batched (in-memory page is fast, updating 20 tuples at once just involves appending the changes to log now)
- - Getting a tuple with some Id
- - Scanning log from newest to oldest is slow
- - Maintain an index mapping tuple id to newest log record for that id
- - Compaction
- - Merging pages to reduce the amount of space taken up by actions on the same tuple id.
- - TODO: Write a simple script that merges two files with data in the format: ACTION ID Value(if applicable)
- - After a page is compacted, the DBMS doesn't need to maintain temporal ordering of records within the page since each tuple id appears at most once in the page.
- - (We don't need to track deletes post compaction, could just remove the key from index)
- - DBMS can sort page by id to improve lookup. See SSTables.
- - Types of Compaction
- - Universal : combine 2 contiguous sorted files
- - Level : Files of same "level" are compacted one by one as they move up each level. (used by LevelDB & it's fork RocksDB)
- - Eg: RocksDB, LevelDB, Apache HBASE, Fauna, AWS Aurora etc.
- - Downsides
- - write-amplification : same record is getting compacted multiple times even if it doesn't change
- - compaction is expensive
- - Tuple Storage
- - DBMS catalogs contain schema info about tables that is used to figure out tuple's layout.
- - Data Representation
- - Common Types: Integers, Floats or Numeric/Decimal, Varchar or Blob, Time
- - Variable Precision Numbers, Fixed Precision Numbers
- - Handling fixed precision numbers is slow
- - Large values (like Varchar) that are larger than a page use separate **overflow storage pages**. Fields in tuple point to these overflow pages instead of storing the data in the tuple itself.
- - Number of columns is also limited since a tuple can't exceed the size of the page in most DBMS.
- - Some systems allow storing really large value in **external files**. DBMS can't update the contents of these files.
- - Paper Suggestion: 'To Blog or Not to Blob: Large Object Storage in a Database or a Filesystem" by Jim Gray
- - Quotes from Chapter 24 (24.2)
- - "B+-tree indices are not efficient for workloads with a very high number of writes" Why?
- - "The key idea of the log-structured merge tree (LSM tree) is to replace random I/O operations during tree inserts, updates, and deletes with a smaller number of sequen- tial I/O operations."
- - Index Inserts & Lookups
- - "An LSM tree consists of several B+-trees, starting with an in-memory tree, called L0, and on-disk trees L1, L2, ... , Lk for some k, where k is called the level."
- - "An index lookup is performed by using separate lookup operations on each of the trees L0, ... , Lk, and merging the results of the lookups."
- - "When a record is first inserted into an LSM tree, it is inserted into the in-memory B+- tree structure L0. A fairly large amount of memory space is allocated for this tree." How much space though?
- - "As the tree grows to fill the memory allocated to it, we need to move data from the in-memory structure to a B+-tree on disk."
- - If L1 is empty the entire in-memory tree L0 is written to disk to create the initial tree L1.
- - If L1 is not empty, the leaf level of L0 is scanned in increasing key order, and entries are merged with the leaf level entries of L1. The merged entries are used to create a new B+-tree using the bottom-up build process. The new tree with the merged entries then replaces the old L1.
- - All entries in the leaf level of the old L1 tree, including those in leaf nodes that do not have any updates, are copied to the new tree instead of being inserted into the existing L1 tree node.
- - The leaves of the new tree are sequentially located, avoiding random I/O during subsequent merges.
- - The leaves are full, avoiding the overhead of partially occupied leaves that can occur with page splits.
- - "Cost to using the LSM structure: the entire contents of the tree are copied each time a set of entries from L0 are copied into L1."
- - "To ensure we get a benefit for cases where the index size on disk is much bigger than the in-memory index, the maximum size of L1 is chosen as k times the target size of L0, for some k. Similarly, the maximum size of each Li+1 is set to k times the target size of Li. Once a particular Li reaches its maximum size, its entries are merged into the next component Li+1. When Li+1 reaches its target size, its entries are in turn merged into Li+2, and so on."
- - "We assumed for simplicity that when a particular level is full, its entries are entirely merged with the next level. This would result in more I/O load during merges with an unused I/O capacity between merges. To avoid this problem, merging is done on a continuous basis; this is called rolling merge. With **rolling merge**, a few pages of Li are merged into corresponding pages of Li+1 at a time, and removed from Li."
+### Log Structured Storage
+- What if DBMS couldn't overwrite data in pages & could only create new pages?
+ - Eg: AWS S3, HDFS
+- (Think of just a key-value store here, PUT contains the entire value that needs to be stored)
+- DBMS stores log records that contain changes to tuples (PUT, DELETE).
+ - Each log record must contain the tuple's unique id.
+ - Put records contain the tuple contents.
+ - Delete records mark the tuple as deleted.
+- When the in-memory page gets full, the DBMS writes it out to disk.
+- Writes are batched (in-memory page is fast, updating 20 tuples at once just involves appending the changes to log now)
+- Getting a tuple with some Id
+ - Scanning log from newest to oldest is slow
+ - Maintain an index mapping tuple id to newest log record for that id
+- Compaction
+ - Merging pages to reduce the amount of space taken up by actions on the same tuple id.
+ - TODO: Write a simple script that merges two files with data in the format: ACTION ID Value (if applicable); a sketch appears just before the Chapter 24 quotes below
+ - After a page is compacted, the DBMS doesn't need to maintain temporal ordering of records within the page since each tuple id appears at most once in the page.
+ - (We don't need to track deletes post compaction, could just remove the key from index)
+ - DBMS can sort page by id to improve lookup. See SSTables.
+ - Types of Compaction
+ - Universal : combine 2 contiguous sorted files
+ - Level : Files of same "level" are compacted one by one as they move up each level. (used by LevelDB & its fork RocksDB)
+- Eg: RocksDB, LevelDB, Apache HBASE, Fauna, AWS Aurora etc.
+- Downsides
+ - write-amplification : same record is getting compacted multiple times even if it doesn't change
+ - compaction is expensive
+- Tuple Storage
+ - DBMS catalogs contain schema info about tables that is used to figure out tuple's layout.
+ - Data Representation
+ - Common Types: Integers, Floats or Numeric/Decimal, Varchar or Blob, Time
+ - Variable Precision Numbers, Fixed Precision Numbers
+ - Handling fixed precision numbers is slow
+ - Large values (like Varchar) that are larger than a page use separate **overflow storage pages**. Fields in tuple point to these overflow pages instead of storing the data in the tuple itself.
+ - Number of columns is also limited since a tuple can't exceed the size of the page in most DBMS.
+ - Some systems allow storing really large values in **external files**. DBMS can't update the contents of these files.
+ - Paper Suggestion: "To Blob or Not to Blob: Large Object Storage in a Database or a Filesystem" by Jim Gray
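+
+A tiny sketch of the compaction TODO from the Log Structured Storage notes above (the `PUT <id> <value>` / `DELETE <id>` record format and all names are assumptions): read log files oldest to newest, keep only the newest record per tuple id, drop deleted ids, and emit the survivors sorted by id.
+```go
+package main
+
+import (
+    "bufio"
+    "fmt"
+    "os"
+    "sort"
+    "strings"
+)
+
+// compact reads log files oldest to newest, keeping only the newest record per
+// tuple id. Record format (an assumption for this sketch):
+//   PUT <id> <value>   or   DELETE <id>
+func compact(paths []string) (map[string]string, error) {
+    latest := map[string]string{}
+    for _, path := range paths { // later files overwrite earlier ones
+        f, err := os.Open(path)
+        if err != nil {
+            return nil, err
+        }
+        sc := bufio.NewScanner(f)
+        for sc.Scan() {
+            parts := strings.SplitN(sc.Text(), " ", 3)
+            if len(parts) < 2 {
+                continue // skip blank or malformed lines
+            }
+            switch parts[0] {
+            case "PUT":
+                if len(parts) == 3 {
+                    latest[parts[1]] = parts[2]
+                }
+            case "DELETE":
+                delete(latest, parts[1]) // no tombstones needed after compaction
+            }
+        }
+        f.Close()
+    }
+    return latest, nil
+}
+
+func main() {
+    kv, err := compact(os.Args[1:])
+    if err != nil {
+        fmt.Fprintln(os.Stderr, err)
+        os.Exit(1)
+    }
+    ids := make([]string, 0, len(kv))
+    for id := range kv {
+        ids = append(ids, id)
+    }
+    sort.Strings(ids) // sort by id, like an SSTable, to speed up later lookups
+    for _, id := range ids {
+        fmt.Printf("PUT %s %s\n", id, kv[id])
+    }
+}
+```
+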
+### Quotes from Chapter 24 (24.2)
+- "B+-tree indices are not efficient for workloads with a very high number of writes" Why?
+- "The key idea of the log-structured merge tree (LSM tree) is to replace random I/O operations during tree inserts, updates, and deletes with a smaller number of sequential I/O operations."
+- Index Inserts & Lookups
+ - "An LSM tree consists of several B+-trees, starting with an in-memory tree, called L0, and on-disk trees L1, L2, ... , Lk for some k, where k is called the level."
+ - "An index lookup is performed by using separate lookup operations on each of the trees L0, ... , Lk, and merging the results of the lookups."
+ - "When a record is first inserted into an LSM tree, it is inserted into the in-memory B+-tree structure L0. A fairly large amount of memory space is allocated for this tree." How much space though?
+ - "As the tree grows to fill the memory allocated to it, we need to move data from the in-memory structure to a B+-tree on disk."
+ - If L1 is empty the entire in-memory tree L0 is written to disk to create the initial tree L1.
+ - If L1 is not empty, the leaf level of L0 is scanned in increasing key order, and entries are merged with the leaf level entries of L1. The merged entries are used to create a new B+-tree using the bottom-up build process. The new tree with the merged entries then replaces the old L1.
+ - All entries in the leaf level of the old L1 tree, including those in leaf nodes that do not have any updates, are copied to the new tree instead of being inserted into the existing L1 tree node.
+ - The leaves of the new tree are sequentially located, avoiding random I/O during subsequent merges.
+ - The leaves are full, avoiding the overhead of partially occupied leaves that can occur with page splits.
+ - "Cost to using the LSM structure: the entire contents of the tree are copied each time a set of entries from L0 are copied into L1."
+ - "To ensure we get a benefit for cases where the index size on disk is much bigger than the in-memory index, the maximum size of L1 is chosen as k times the target size of L0, for some k. Similarly, the maximum size of each Li+1 is set to k times the target size of Li. Once a particular Li reaches its maximum size, its entries are merged into the next component Li+1. When Li+1 reaches its target size, its entries are in turn merged into Li+2, and so on."
+ - "We assumed for simplicity that when a particular level is full, its entries are entirely merged with the next level. This would result in more I/O load during merges with an unused I/O capacity between merges. To avoid this problem, merging is done on a continuous basis; this is called rolling merge. With **rolling merge**, a few pages of Li are merged into corresponding pages of Li+1 at a time, and removed from Li."
## 5 : Storage Models & Compression
- DB Workloads
@@ -214,34 +210,36 @@ Fun Fact: SQL was Sequel (Structured English Query Language) initially but IBM h
- Tuple (NSM only)
- Attribute
- Column (DSM only)
-- Naive Compression (DB doesn't know about the data)
- - Things to consider: Computation overhead, Compress vs. Decompress speed
- - MySQL InnoDB Compression
- - Default size is 16KB
- - Pages are compressed & then padded to nearest power of 2 (1, 2, 4, 8 KB)
- - Page goes to buffer pool when it needs to be used
- - Update queries won't necessarily need to know the previous value. If an update just appends the value to the page it can be put after the compressed data in the page & metadata can store offsets for compressed & non-compressed data.
- - DBMS must decompress data first before reading & (potentially) modifying it. Naive schemes also don't consider high-level semantics of the data.
-- Columnar Compression
- - Run-length Encoding
- - data to be sorted for max compression
- - value -> (value, start offset, # of elements in the run)
- - works for highly categorical data (eg: list of values with genders as M/F/Other)
- - Bit-packing
- - store value as smaller data type (eg: int64 -> int8) if the value doesn't need the full size
- - DBs handle the conversion during aggregations. How??? TODO: check in detail
- - A marker is stored for larger values (eg: larger than int8) which maps to a separate lookup table.
- - Bitmap
- - If number of distinct values is low (i.e. value cardinality is low), each unique value can be stored as a bitmap
- - i-th position in bitmap corresponds to i-th tuple
- - Delta
- - Instead of storing all values, record the difference in values
- - Can combine w/ RLE to get better compression ratios if repetition is likely
- - Eg: temperature stored as `99, +1, +1` instead of being stored as actual values `99, 100, 101`
- - Incremental
- - Type of delta compression that avoids storing common prefix / suffix b/w consecutive tuple.
- - Eg: rob, robbed, robbing, robot -> rob, 3bed, 4ing, 3ot (common prefix: "rob" -> "robb" -> "rob")
- - Dictionary
+### Naive Compression
+- DB doesn't know about the data
+- Things to consider: Computation overhead, Compress vs. Decompress speed
+- MySQL InnoDB Compression
+ - Default page size is 16KB
+ - Pages are compressed & then padded to nearest power of 2 (1, 2, 4, 8 KB)
+ - Page goes to buffer pool when it needs to be used
+ - Update queries won't necessarily need to know the previous value. If an update just appends the value to the page it can be put after the compressed data in the page & metadata can store offsets for compressed & non-compressed data.
+- DBMS must decompress data first before reading & (potentially) modifying it. Naive schemes also don't consider high-level semantics of the data.
+
+### Columnar Compression
+- Run-length Encoding
+ - data to be sorted for max compression
+ - value -> (value, start offset, # of elements in the run)
+ - works for highly categorical data (eg: list of values with genders as M/F/Other)
+- Bit-packing
+ - store value as smaller data type (eg: int64 -> int8) if the value doesn't need the full size
+ - DBs handle the conversion during aggregations. TODO: check in detail
+ - A marker is stored for larger values (eg: larger than int8) which maps to a separate lookup table.
+- Bitmap
+ - If number of distinct values is low (i.e. value cardinality is low), each unique value can be stored as a bitmap
+ - i-th position in bitmap corresponds to i-th tuple
+- Delta
+ - Instead of storing all values, record the difference in values
+ - Can combine w/ RLE to get better compression ratios if repetition is likely
+ - Eg: temperature stored as `99, +1, +1` instead of being stored as actual values `99, 100, 101`
+- Incremental
+ - Type of delta compression that avoids storing common prefix / suffix b/w consecutive tuples.
+ - Eg: rob, robbed, robbing, robot -> rob, 3bed, 4ing, 3ot (common prefix: "rob" -> "robb" -> "rob")
+- Dictionary
- Map variable length values to a small int identifier
- Lookups for data can use the compressed data instead of the actual value if the DB is aware. For eg: If a string "Andrea" is mapped to an int "30" then the DB can look for that value without decompressing the data.
- This isn't hashing since the dict needs to support both encoding, decoding.
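+
+A small sketch of dictionary compression in Go (my own illustration, not from the lecture): build the two-way mapping, then evaluate an equality predicate directly on the encoded codes without decompressing the column:
+```go
+package main
+
+import "fmt"
+
+// dictionary maps each distinct string to a small int code and back, so a
+// predicate like col = "Andrea" can be evaluated on the codes without
+// decompressing the column.
+type dictionary struct {
+    toCode map[string]int32
+    toVal  []string
+}
+
+func buildDictionary(column []string) (*dictionary, []int32) {
+    d := &dictionary{toCode: map[string]int32{}}
+    encoded := make([]int32, len(column))
+    for i, v := range column {
+        code, ok := d.toCode[v]
+        if !ok {
+            code = int32(len(d.toVal))
+            d.toCode[v] = code
+            d.toVal = append(d.toVal, v)
+        }
+        encoded[i] = code
+    }
+    return d, encoded
+}
+
+func main() {
+    d, encoded := buildDictionary([]string{"Andrea", "Matt", "Andrea", "Prashanth"})
+    target := d.toCode["Andrea"] // encode the predicate literal once
+    for i, code := range encoded {
+        if code == target { // compare small ints, not strings
+            fmt.Println("row", i, "matches", d.toVal[code])
+        }
+    }
+}
+```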
@@ -256,90 +254,92 @@ Fun Fact: SQL was Sequel (Structured English Query Language) initially but IBM h
- How the DBMS manages its memory and moves data back & forth from disk?
- Goal: Build DB that can store more data than memory available to it.
- Spatial Control (layout of data on device), Temporal Control (when to read from, write data to disk)
-- Buffer Pool Manager / Buffer Manager / Page Cache / Buffer Cache
- - Memory regions organised as array of fixed size pages. Each entry in the array is called frame.
- - When DBMS requests a page, an exact copy is placed into one of these frames.
- - Dirty pages are buffered & not written to disk immediately. (Write back cache)
- - Dirty Page : it is different from its corresponding page on disk.
- - Page table keeps track of pages currently in memory along with metadata like dirty flag, pin/reference counter for each page.
- - Dirty flag tells if the page has been modified by a query & not saved to disk yet.
- - Sidenote: DB waits for entry to be written to the log page before it makes changes in the table page.
- - Pin prevents buffer pool from evicting the page when you're running a query that'll need the page. Pin counter tracks how many queries are using the particular page.
- - Latch can be acquired on page table for a page that'll prevent other thread from using the same page & overwriting it.
- - Lock Vs Latch
- - Lock
- - protects DB's logical contents from other transactions
- - held for transaction duration
- - need to be able to rollback changes
- - Latch
- - protects critical sections of DB's internal data structure from other threads
- - held for operation duration
- - don't need to be able to rollback changes
- - mutex
- - Why not use a hashmap instead of maintaining 2 tables?
- - Prof mentions that the page table is like a hashtable only.
- - Do we lose the contents of buffer pool on crash? Yes
- - Prof mentions that we want to do this for correctness reasons. Why?
- - Page Table Vs Page Directory
- - Page directory is the mapping from page ids to page locations in the DB files. (on disk)
- - Page table is the mapping from page ids to a copy of the page in buffer pool frames. (in memory)
+### Buffer Pool Manager
+- Also known as: Buffer Manager / Page Cache / Buffer Cache
+- A memory region organised as an array of fixed-size pages. Each entry in the array is called a frame.
+- When DBMS requests a page, an exact copy is placed into one of these frames.
+- Dirty pages are buffered & not written to disk immediately. (Write back cache)
+ - Dirty Page : it is different from its corresponding page on disk.
+- Page table keeps track of pages currently in memory along with metadata like dirty flag, pin/reference counter for each page.
+ - Dirty flag tells if the page has been modified by a query & not saved to disk yet.
+ - Sidenote: DB waits for entry to be written to the log page before it makes changes in the table page.
+ - Pin prevents buffer pool from evicting the page when you're running a query that'll need the page. Pin counter tracks how many queries are using the particular page.
+ - A latch can be acquired on the page table entry for a page, which prevents other threads from using the same page & overwriting it.
+ - Lock Vs Latch
+ - Lock
+ - protects DB's logical contents from other transactions
+ - held for transaction duration
+ - need to be able to rollback changes
+ - Latch
+ - protects critical sections of DB's internal data structure from other threads
+ - held for operation duration
+ - don't need to be able to rollback changes
+ - mutex
+ - Why not use a hashmap instead of maintaining 2 tables?
+ - Prof mentions that the page table is essentially a hashtable anyway.
+- Do we lose the contents of buffer pool on crash? Yes
+ - Prof mentions that we want to do this for correctness reasons. Why?
+- Page Table Vs Page Directory
+ - Page directory is the mapping from page ids to page locations in the DB files. (on disk)
+ - Page table is the mapping from page ids to a copy of the page in buffer pool frames. (in memory)
- Allocation Policies
- Goal: Evicting, Pre-fetching, Writing optimally
- Global Policies
- Local Policies
-- Buffer Pool Optimisations
- - Multiple Buffer Pools (per DB, table, index and so on)
- - also improves latch contention
- - approaches:
- - maintain an object id that points to specific buffer pool
- - use hashing to select the buffer pool
- - Pre Fetching
- - Sequential Scans
- - Index Scans
- - Scan Sharing / Synchronised Scans
- - different from result caching
- - allow multiple queries to attach to a single cursor that scans a table
- - queries don't need to be the same
- - can also share partial results
- - Buffer Pool Bypass (using private buffer pool)
- - avoids overhead of using the buffer pool (like acquiring latch, getting frames etc.)
- - memory is local to running query (can't be shared across queries)
- - works well if operator needs to read large sequence of contiguous pages on disk
- - can also be used for temporary data
- - OS Page Cache (specific to Postgres)
- - OS maintains its own filesystem cache (aka page or buffer cache)
- - Most DBMS use direct I/O (using O_DIRECT) to bypass OS cache.
- - prevents redundant copies
- - OS has different eviction policies
- - loss of control over file I/O
- - Sidenote: Amazon's pg fork (Aurora) doesn't use OS's page cache.
- - EXPLAIN (BUFFER) (sql statement here)
- - pg_prewarm
-- Buffer Replacement Policies
- - which page to evict from buffer pool when freeing up frames?
- - LRU
- - Clock (like LRU)
- - pages organised in circular buffer
- - each page has a reference bit which is set to 1 when page is accessed
- - when the clock hand moves to the next page
- - if ref bit 0 then evict
- - if ref bit 1 then set to 0
- - Issue with LRU, Clock
- - Susceptible to sequential flooding (query does sequential scan which pollutes the buffer pool with pages that might not be read again)
- - LRU-K
- - tracks history of last K refs to each page as timestamps & compute the interval b/w subsequent access
- - history is used to estimate the next time page is going to be accessed & eviction is done basis that
- - Priority Hints
- - DBMS can provided hints on whether page is important or not to buffer pool
- - Dirty Pages
- - in the buffer pool, if a page
- - isn't dirty: drop it (fast path but might need this page soon)
- - is dirty: need to write back to disk (slow path & might not need page in near future)
- - which to drop? tradeoff b/w fast eviction vs. dirty writing page not needed in future
- - DBMS can periodically walk through the page table and write dirty pages to disk to optimise this
- - Other Memory Pools
- - for sorting+join buffer, query cache, maintenance buffer, log buffer, dictionary cache
- - might not be backed by disk
+
+### Buffer Pool Optimisations
+- Multiple Buffer Pools (per DB, table, index and so on)
+ - also reduces latch contention
+ - approaches:
+ - maintain an object id that points to specific buffer pool
+ - use hashing to select the buffer pool
+- Pre Fetching
+ - Sequential Scans
+ - Index Scans
+- Scan Sharing / Synchronised Scans
+ - different from result caching
+ - allow multiple queries to attach to a single cursor that scans a table
+ - queries don't need to be the same
+ - can also share partial results
+- Buffer Pool Bypass (using private buffer pool)
+ - avoids overhead of using the buffer pool (like acquiring latch, getting frames etc.)
+ - memory is local to running query (can't be shared across queries)
+ - works well if operator needs to read large sequence of contiguous pages on disk
+ - can also be used for temporary data
+- OS Page Cache (specific to Postgres)
+ - OS maintains its own filesystem cache (aka page or buffer cache)
+ - Most DBMS use direct I/O (using O_DIRECT) to bypass OS cache.
+ - prevents redundant copies
+ - OS has different eviction policies
+ - loss of control over file I/O
+ - Sidenote: Amazon's pg fork (Aurora) doesn't use OS's page cache.
+ - EXPLAIN (BUFFERS) (sql statement here)
+ - pg_prewarm
+### Buffer Replacement Policies
+- Which page to evict from buffer pool when freeing up frames?
+- LRU
+- Clock (like LRU; see the sketch at the end of this section)
+ - pages organised in circular buffer
+ - each page has a reference bit which is set to 1 when page is accessed
+ - when the clock hand moves to the next page
+ - if ref bit 0 then evict
+ - if ref bit 1 then set to 0
+- Issue with LRU, Clock
+ - Susceptible to sequential flooding (query does sequential scan which pollutes the buffer pool with pages that might not be read again)
+- LRU-K
+ - tracks history of last K refs to each page as timestamps & computes the interval b/w subsequent accesses
+ - history is used to estimate the next time the page is going to be accessed & eviction is done based on that
+- Priority Hints
+ - DBMS can provide hints to the buffer pool on whether a page is important or not
+- Dirty Pages
+ - in the buffer pool, if a page
+ - isn't dirty: drop it (fast path but might need this page soon)
+ - is dirty: need to write back to disk (slow path & might not need page in near future)
+ - which to drop? tradeoff b/w fast eviction vs. writing out a dirty page that's not needed in the future
+ - DBMS can periodically walk through the page table and write dirty pages to disk to optimise this
+- Other Memory Pools
+ - for sorting+join buffer, query cache, maintenance buffer, log buffer, dictionary cache
+ - might not be backed by disk
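+
+A compact sketch of the Clock policy described above (hypothetical frame table, not the course's code): every access sets the frame's reference bit, and the sweeping hand clears set bits until it finds an unpinned frame whose bit is already 0, which becomes the victim:
+```go
+package main
+
+import "fmt"
+
+// clock is a sketch of the Clock (second-chance) policy over buffer-pool frames.
+type clock struct {
+    ref    []bool // reference bit per frame, set whenever the frame is accessed
+    pinned []int  // pin count per frame; pinned frames are never evicted
+    hand   int
+}
+
+func newClock(frames int) *clock {
+    return &clock{ref: make([]bool, frames), pinned: make([]int, frames)}
+}
+
+// access is called whenever the page in `frame` is used.
+func (c *clock) access(frame int) { c.ref[frame] = true }
+
+// evict sweeps the circular frame list: set reference bits are cleared (the
+// "second chance"), and the first unpinned frame with a cleared bit is the
+// victim. Returns -1 if every frame is pinned.
+func (c *clock) evict() int {
+    for i := 0; i < 2*len(c.ref); i++ { // two full sweeps are always enough
+        f := c.hand
+        c.hand = (c.hand + 1) % len(c.ref)
+        if c.pinned[f] > 0 {
+            continue
+        }
+        if c.ref[f] {
+            c.ref[f] = false
+            continue
+        }
+        return f
+    }
+    return -1
+}
+
+func main() {
+    c := newClock(3)
+    c.access(0)
+    c.access(1) // frame 2 was never referenced, so it gets evicted first
+    fmt.Println("victim frame:", c.evict())
+}
+```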
## 7 : Hash Tables
- Course Progress (bottoms-up)
@@ -366,182 +366,182 @@ Fun Fact: SQL was Sequel (Structured English Query Language) initially but IBM h
- Hash Functions
- for any input key, return an integer representation of that key
- why not use SHA or other crypto hash functions? we don't have security concerns since these are keys in an internal data structure, the extra overhead & loss of performance isn't worth it
-- Static Hashing Schemes
- - Linear Probe / Open Address
- - giant table of slots
- - resolves collisions by linearly searching for the next free slot in the table
- - Insertion
- - jump to next free slot, loop around if at end
- - note: lookup stop when you either find the key, loop back to start of search, or find out the location and you know it can't be there (how??)
- - Deletes (handling what happens when you delete a key from a lot)
- - Movement: slide keys up after rehashing (costly)
- - Tombstone: set a marker to indicate that the entry in the slot is logically deleted
- - slot can be re-used for new keys
- - may need periodic garbage collection
- - Non Unique Keys (eg. usage: using hashtable for joins)
- - Separate Linked List : store values in separate storage area for each key
- - Redundant Keys: store duplicate keys entries together in the hash table
- - easier to implement so this is what most systems do
- - check is key exists: find first key
- - deletion: delete all keys
- - Sidenote: name of column and record id together can be used to guarantee uniqueness of keys
- - Robin Hood Hashing
- - stealing slots from "rich" keys and giving them to "poor keys"
- - insertion: a key takes slot from another if the first is farther from its optimal position than the latter
- - each key tracks the no. of positions they're from their optimal position in the table
- - the number of "shifts" from ideal position determines how poor the key is
- - a key inserted at ideal position has a counter of 0
- - similarly a key that had to shift twice has the counter of 2
- - more the counter, poorer the key is
- - if the key being inserted has a larger counter than key already in place then the key already in place is shifted
- - tradeoff: lots of shuffling can occur on insertion but this can make reads slightly faster | linear hashing is faster and easier for most cases
- - Cuckoo Hashing
- - used in a lot of systems
- - use multiple hash tables with different hash function seeds
- - insertion: check every table & pick anyone that has a free slot
- - if no table has a free slot, evict element from one of them and then re-hash it find a new location
- - you need to keep track that you're not in a cycle
- - Good OSS implementation: https://github.com/efficient/libcuckoo
- - What happens if you can't find a new slot using any of the available hash functions? Allocate a new table 2x the size of previous. Re-insert all keys in new hash table.
- - (A new hash function might work too but implementations don't do this)
- - Why is it not considered dynamic if it can allocate a new table?
- - The process is not incremental (eg: think of re-allocating an array in C to insert more elements)
-- Dynamic Hash Tables
- - static hashing schemes require the DBMS to know the number of elements it wants to store
- - table needs to be rebuilt on growth/shrinkage in size
- - resize on demand without complete re-allocation
- - Chained Hashing
- - linked list of buckets for each slot in hash table
- - resolve collisions by placing all elements with the same hash key into the same bucket
- - new bucket is allocated when a bucket is full
- - Extendible Hashing
- - split buckets instead of letting the linked list grow forever
- - multiple slot locations can point to the same location
- - reshuffle bucket entries on split and increase the no. of bits to examine
- - eg: no. of bits increasing : 00, 01, 10, 11 -> 000, 010, 100, 110, 001, 011, 101, 111
- - example of hashed keys: 011100, 10111, 10110
- - Linear Hashing (slightly confusing, read about it somewhere else)
- - hash table maintains a pointer that tracks the next bucket to split
- - when any bucket overflows, split the bucket at the pointer location (not the bucket that overflowed) (in preparation for future)
- - the overflowed bucket is extended
- - use multiple hashes to find the right bucket for a given key
- - new hash function is used at/below the split pointer
- - splitting buckets based on the split pointer will eventually get to all overflowed buckets
- - when the ptr reaches the last slot, delete the first hash function and move back to beginning
- - eg:
- - hash functions: key % n, key % 2n, key % 4n
- - if you started out with 4 bucket lists, you'd have 8 bucket lists after splitting (slide has 5 because of no space)
- - Note: Deletion was skipped in lecture. See slides or notes.
-- Sidenote: Hash table isn't what you want to use for a table index.
-
+### Static Hashing Schemes
+- Linear Probe / Open Address (see the sketch after this list)
+ - giant table of slots
+ - resolves collisions by linearly searching for the next free slot in the table
+ - Insertion
+ - jump to next free slot, loop around if at end
+ - note: a lookup stops when you either find the key, loop back to the start of the search, or hit an empty slot (the key would have been placed in that empty slot if it existed, so it can't be further along)
+ - Deletes (handling what happens when you delete a key from a slot)
+ - Movement: slide keys up after rehashing (costly)
+ - Tombstone: set a marker to indicate that the entry in the slot is logically deleted
+ - slot can be re-used for new keys
+ - may need periodic garbage collection
+ - Non Unique Keys (eg. usage: using hashtable for joins)
+ - Separate Linked List : store values in separate storage area for each key
+ - Redundant Keys: store duplicate key entries together in the hash table
+ - easier to implement so this is what most systems do
+ - check if key exists: find the first key
+ - deletion: delete all keys
+ - Sidenote: name of column and record id together can be used to guarantee uniqueness of keys
+- Robin Hood Hashing
+ - stealing slots from "rich" keys and giving them to "poor keys"
+ - insertion: a key takes the slot from another key if the former is farther from its optimal position than the latter
+ - each key tracks the no. of positions it is from its optimal position in the table
+ - the number of "shifts" from ideal position determines how poor the key is
+ - a key inserted at ideal position has a counter of 0
+ - similarly a key that had to shift twice has the counter of 2
+ - the higher the counter, the poorer the key is
+ - if the key being inserted has a larger counter than the key already in place, then the key already in place is displaced and continues probing for a new slot
+ - tradeoff: lots of shuffling can occur on insertion but this can make reads slightly faster | plain linear probing is faster and easier for most cases
+- Cuckoo Hashing
+ - used in a lot of systems
+ - use multiple hash tables with different hash function seeds
+ - insertion: check every table & pick any one that has a free slot
+ - if no table has a free slot, evict an element from one of them and then re-hash it to find a new location
+ - you need to keep track that you're not in a cycle
+ - Good OSS implementation: https://github.com/efficient/libcuckoo
+ - What happens if you can't find a new slot using any of the available hash functions? Allocate a new table 2x the size of previous. Re-insert all keys in new hash table.
+ - (A new hash function might work too but implementations don't do this)
+ - Why is it not considered dynamic if it can allocate a new table?
+ - The process is not incremental (eg: think of re-allocating an array in C to insert more elements)
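+
+A small sketch of linear-probe hashing with tombstone deletes, as described above (illustrative only; a real table would also resize and rehash once it fills up):
+```go
+package main
+
+import (
+    "fmt"
+    "hash/fnv"
+)
+
+type slotState uint8
+
+const (
+    empty slotState = iota
+    occupied
+    tombstone // logically deleted; probes must keep going past it
+)
+
+type entry struct {
+    key   string
+    value int
+    state slotState
+}
+
+type linearProbeTable struct {
+    slots []entry
+}
+
+func newTable(capacity int) *linearProbeTable {
+    return &linearProbeTable{slots: make([]entry, capacity)}
+}
+
+func (t *linearProbeTable) home(key string) int {
+    h := fnv.New32a()
+    h.Write([]byte(key))
+    return int(h.Sum32()) % len(t.slots)
+}
+
+// put overwrites the key if it is already on the probe chain, otherwise claims
+// the first empty or tombstone slot along the chain.
+func (t *linearProbeTable) put(key string, value int) {
+    for i, idx := 0, t.home(key); i < len(t.slots); i, idx = i+1, (idx+1)%len(t.slots) {
+        s := &t.slots[idx]
+        if s.state == empty {
+            break // key not present; fall through to claim a free slot
+        }
+        if s.state == occupied && s.key == key {
+            s.value = value
+            return
+        }
+    }
+    for i, idx := 0, t.home(key); i < len(t.slots); i, idx = i+1, (idx+1)%len(t.slots) {
+        if s := &t.slots[idx]; s.state != occupied {
+            *s = entry{key: key, value: value, state: occupied}
+            return
+        }
+    }
+    panic("table full") // sketch only: a real table would resize and rehash
+}
+
+// get stops at an empty slot: if the key existed, it would have been placed no
+// later than that slot, so it cannot be further along the chain.
+func (t *linearProbeTable) get(key string) (int, bool) {
+    for i, idx := 0, t.home(key); i < len(t.slots); i, idx = i+1, (idx+1)%len(t.slots) {
+        s := t.slots[idx]
+        if s.state == empty {
+            return 0, false
+        }
+        if s.state == occupied && s.key == key {
+            return s.value, true
+        }
+    }
+    return 0, false
+}
+
+// del leaves a tombstone so later lookups don't stop early; the slot stays reusable.
+func (t *linearProbeTable) del(key string) {
+    for i, idx := 0, t.home(key); i < len(t.slots); i, idx = i+1, (idx+1)%len(t.slots) {
+        s := &t.slots[idx]
+        if s.state == empty {
+            return
+        }
+        if s.state == occupied && s.key == key {
+            s.state = tombstone
+            return
+        }
+    }
+}
+
+func main() {
+    t := newTable(8)
+    t.put("a", 1)
+    t.put("b", 2)
+    t.del("a")
+    _, ok := t.get("a")
+    fmt.Println("a present after delete?", ok) // false, and the slot is reusable
+}
+```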
+
+### Dynamic Hash Tables
+- Static hashing schemes require the DBMS to know the number of elements it wants to store
+ - Table needs to be rebuilt on growth/shrinkage in size
+- Resize on demand without complete re-allocation
+- Chained Hashing
+ - linked list of buckets for each slot in hash table
+ - resolve collisions by placing all elements with the same hash key into the same bucket
+ - new bucket is allocated when a bucket is full
+- Extendible Hashing
+ - split buckets instead of letting the linked list grow forever
+ - multiple slots in the directory can point to the same bucket
+ - reshuffle bucket entries on split and increase the no. of bits to examine
+ - eg: no. of bits increasing : 00, 01, 10, 11 -> 000, 010, 100, 110, 001, 011, 101, 111
+ - example of hashed keys: 011100, 10111, 10110
+- Linear Hashing (slightly confusing, read about it somewhere else)
+ - hash table maintains a pointer that tracks the next bucket to split
+ - when any bucket overflows, split the bucket at the pointer location (not the bucket that overflowed) (in preparation for future)
+ - the overflowed bucket is extended
+ - use multiple hashes to find the right bucket for a given key
+ - new hash function is used at/below the split pointer
+ - splitting buckets based on the split pointer will eventually get to all overflowed buckets
+ - when the ptr reaches the last slot, delete the first hash function and move back to beginning
+ - eg:
+ - hash functions: key % n, key % 2n, key % 4n
+ - if you started out with 4 bucket lists, you'd have 8 bucket lists after splitting (slide has 5 because of no space)
+ - Note: Deletion was skipped in lecture. See slides or notes.
+- Sidenote: Hash table isn't what you want to use for a table index. Why?
## 8 : B+ Tree Index
-- table indexes mainly use B+ trees
-- table index: replica of a subset of a table's attributes that are organized and/or sorted for efficient access
-- trade-off for no. of indexes to create per DB: storage & maintenance overhead
-- in OLAP DBs: indexes are primarily used for speeding up joins in smaller tables
+- Table indexes mainly use B+ trees
+- Table index: replica of a subset of a table's attributes that are organized and/or sorted for efficient access
+- Trade-off for no. of indexes to create per DB: storage & maintenance overhead
+- In OLAP DBs: indexes are primarily used for speeding up joins in smaller tables
- B-tree family: B-Tree, B+ Tree, B* Tree, Blink-Tree
- pg uses B+ trees, much like other DBs
- Note: A slightly modified kind of B+ tree structure is discussed. (Not the original one from the paper)
- B+ tree is self-balancing tree, keeps data sorted
- - Operations: Search, Sequential Access, Insertion, Deletion in O(logn)
- - Optimized for systems that read & write large blocks of data
- - Node can have > 2 children
- - `M`-way search tree??
- - Properties
- - Perfectly balanced (every lead node is at the same depth in a tree)
- - Every node other than the root is at least half-full
- - `M/2 - 1 <= #keys <= M-1`
- - Every inner node with `k` keys has `k+1` non-null children
- - Some implementations can be slightly different. Doesn't matter as long as the operations are O(logn)
- - B+ tree node comprises of an array of key/value pairs.
- - Keys are derived from attribute(s) that the index is based on.
- - The values will differ based on whether the node is classified as "inner node" or "leaf node".
- - The arrays are usually? kept in sorted order.
- - Inner node can have a key that's deleted since they act like guide posts. Leat nodes can't have deleted keys.
- - Leaf Node Values
- - Approach 1 (Record IDs) : a ptr. to the location of the tuple to which the index entry corresponds
- - extra lookup
- - done by: pg, sql server, db2, oracle
- - Approach 2 (Tuple Data): leaf nodes store actual content of the tuple
- - extra overhead: secondary indexes must store the record IDs as their values
- - done by: sqlite, sql server, mysql, oracle
- - B-tree vs. B+ tree
- - B-tree stores keys & values in all nodes in tree
- - space efficient (since each key only appears once)
- - range scans are expensive compared to B+ tree since leaf nodes are sequential
- - B+ Tree
- - See https://www.cs.usfca.edu/~galles/visualization/BPlusTree.html
- - Max Degree = Max Keys Per Node + 1
- - Operations: Insert, Delete, Lookup
- - DBMS can use a B+ tree index if the query provides any of the attributes of the search key
- - Eg: Index on
- - Conjunction: (a = 1 AND b = 2 AND c = 3)
- - Partial: (a = 1 AND b = 2)
- - Suffix: (b = 2), (c = 3)
- - Find (A, B) ; (A, \*), (\*, A)
- - Eg: Index on
- - Column Values: {A, B, C, D}
- - Search: col2 = B;
- - Duplicate Key Handling
- - Approach 1 : Append Record ID
- - Add tuple's unique Record ID as part of the key to ensure that all keys are unique
- - DBMS can use partial keys to find tuples.
- - pg does this
- - Approach 2 : Overflow Leaf Nodes
- - Let leaf nodes to spill into overflow nodes that contain the duplicate keys.
- - Complex to maintain.
- - Clustered Indexes
- - table is stored in order specified by the primary key
- - some DBs always use a clustered index while others can't use them at all
- - pg doesn't maintain the clustered index once done
- - mysql maintains clustered indexes automatically for primary keys (why can it?)
- - You can specify BTREE or HASH in pg while creating the index
- - Recommended Book on BTrees: Modern BTree Techniques (Goetz Graefe)
- - Design Decisions while Implementing BTrees
- - Node Size
- - as per research: slower the storage device, larger the optimal node size for B+ Tree
- - HDD: ~1MB, SSD: ~10KB, In-Memory: ~512B
- - we want to maximize the sequential IO we're doing
- - can also vary per workload
- - root to leaf traversal : small node sizes
- - range scans : large node sizes
- - Merge Threshold
- - some dbms don't always merge nodes when they're half full & delay it
- - to reduce the amount of re-organization
- - it might be better to let smaller nodes exists & periodically rebuild the entire tree
- - Variable Length Keys
- - Pointers : store key as pointers to the tuple's attribute
- - bad in a disk based system since we'd need to fetch each key using the ptr for comparison while traversing
- - in memory based systems (esp. when there's less memory), this approach prevents duplication
- - a variant of b-tree called t-tree does this
- - Variable Length Nodes
- - requires careful memory management
- - Padding : pad the key to be max length of the key type
- - mysql does this
- - Key Map / Indirection
- - embed an array of pointers that map to the key + value list within the node
- - store an fixed length int instead of the key & map it to the key
- - kinda like dictionary compression
- - Intra Node Search
- - Linear
- - SIMD (Single Instruction/Multiple Data) can be used to speed this up
- - Binary
- - data needs to be sorted for this
- - Interpolation (fastest)
- - approx. location of desired key based on known distribution of keys
- - data should be sorted
- - note: this hasn't been widely implemented outside academia
- - Optimizations
- - Buffer Updates : instead of applying the change right way, keep a log and apply the changes after a while (fractal tree index)
- - Prefix Compression (like incremental compression in Lecture 5)
- - Deduplication : avoid storing multiple copies of same key in leaf nodes; store key once & maintain a list of tuples with that key (similar approach as hash tables)
- - Suffix Truncation: we don't need entire keys in inner nodes, just need a minimum prefix to correctly route probes into the index
- - Pointer Swizzling
- - store ptrs instead of page id to avoid the address lookup from the page table if a page is pinned in the buffer pool
- - some extra metadata needs to be tracked so that we know when a page is unpinned & we can't use the ptr anymore
- - Bulk Insert
- - fastest way to build a new B+ tree for an existing table is to first sort the keys & then build the index from bottom up
+- Operations: Search, Sequential Access, Insertion, Deletion in O(logn)
+- Optimized for systems that read & write large blocks of data
+- Node can have > 2 children
+- It is an `M`-way search tree (each node can have up to `M` children)
+- Properties
+ - Perfectly balanced (every leaf node is at the same depth in the tree)
+ - Every node other than the root is at least half-full
+ - `M/2 - 1 <= #keys <= M-1`
+ - Every inner node with `k` keys has `k+1` non-null children
+ - Some implementations can be slightly different. Doesn't matter as long as the operations are O(logn)
+- A B+ tree node comprises an array of key/value pairs.
+ - Keys are derived from attribute(s) that the index is based on.
+ - The values will differ based on whether the node is classified as "inner node" or "leaf node".
+ - The arrays are usually (but not necessarily) kept in sorted order.
+ - Inner nodes can contain keys that have since been deleted, because their keys only act as guide posts. Leaf nodes can't have deleted keys.
+- Leaf Node Values
+ - Approach 1 (Record IDs) : a ptr. to the location of the tuple to which the index entry corresponds
+ - extra lookup
+ - done by: pg, sql server, db2, oracle
+ - Approach 2 (Tuple Data): leaf nodes store actual content of the tuple
+ - extra overhead: secondary indexes must store the record IDs as their values
+ - done by: sqlite, sql server, mysql, oracle
+- B-tree vs. B+ tree
+ - B-tree stores keys & values in all nodes in tree
+ - space efficient (since each key only appears once)
+ - range scans are expensive compared to a B+ tree, where the leaf nodes are linked sequentially
+- B+ Tree
+ - See https://www.cs.usfca.edu/~galles/visualization/BPlusTree.html
+ - Max Degree = Max Keys Per Node + 1
+ - Operations: Insert, Delete, Lookup
+ - DBMS can use a B+ tree index if the query provides any of the attributes of the search key
+ - Eg: Index on
+ - Conjunction: (a = 1 AND b = 2 AND c = 3)
+ - Partial: (a = 1 AND b = 2)
+ - Suffix: (b = 2), (c = 3)
+ - Find (A, B) ; partial keys: (A, \*), (\*, B)
+ - Eg: Index on
+ - Column Values: {A, B, C, D}
+ - Search: col2 = B;
+ - Duplicate Key Handling
+ - Approach 1 : Append Record ID
+ - Add tuple's unique Record ID as part of the key to ensure that all keys are unique
+ - DBMS can use partial keys to find tuples.
+ - pg does this
+ - Approach 2 : Overflow Leaf Nodes
+ - Let leaf nodes spill into overflow nodes that contain the duplicate keys.
+ - Complex to maintain.
+ - Clustered Indexes
+ - table is stored in order specified by the primary key
+ - some DBs always use a clustered index while others can't use them at all
+ - pg doesn't maintain the clustered index once done
+ - mysql (InnoDB) maintains clustered indexes automatically for primary keys (it can because the table itself is stored in the primary-key B+ tree)
+ - You can specify BTREE or HASH in pg while creating the index
+- Recommended Book on BTrees: Modern BTree Techniques (Goetz Graefe)
+- Design Decisions while Implementing BTrees
+ - Node Size
+ - as per research: the slower the storage device, the larger the optimal node size for a B+ Tree
+ - HDD: ~1MB, SSD: ~10KB, In-Memory: ~512B
+ - we want to maximize the sequential IO we're doing
+ - can also vary per workload
+ - root to leaf traversal : small node sizes
+ - range scans : large node sizes
+ - Merge Threshold
+ - some DBMSs don't merge nodes as soon as they fall below half full & delay the merge instead
+ - to reduce the amount of re-organization
+ - it might be better to let under-full nodes exist & periodically rebuild the entire tree
+ - Variable Length Keys
+ - Pointers : store keys as pointers to the tuple's attribute
+ - bad in a disk based system since we'd need to fetch each key using the ptr for comparison while traversing
+ - in in-memory systems (esp. when memory is scarce), this approach avoids duplicating the keys
+ - a variant of b-tree called t-tree does this
+ - Variable Length Nodes
+ - requires careful memory management
+ - Padding : pad the key to be max length of the key type
+ - mysql does this
+ - Key Map / Indirection
+ - embed an array of pointers that map to the key + value list within the node
+ - store a fixed-length int instead of the key & map it back to the full key
+ - kinda like dictionary compression
+- Intra Node Search
+ - Linear
+ - SIMD (Single Instruction/Multiple Data) can be used to speed this up
+ - Binary
+ - data needs to be sorted for this
+ - Interpolation (fastest)
+ - approximate the location of the desired key based on the known distribution of keys (see the sketch at the end of this section)
+ - data should be sorted
+ - note: this hasn't been widely implemented outside academia
+- Optimizations
+ - Buffer Updates : instead of applying the change right away, keep a log and apply the changes after a while (fractal tree index)
+ - Prefix Compression (like incremental compression in Lecture 5)
+ - Deduplication : avoid storing multiple copies of same key in leaf nodes; store key once & maintain a list of tuples with that key (similar approach as hash tables)
+ - Suffix Truncation: we don't need entire keys in inner nodes, just need a minimum prefix to correctly route probes into the index
+ - Pointer Swizzling
+ - store ptrs instead of page id to avoid the address lookup from the page table if a page is pinned in the buffer pool
+ - some extra metadata needs to be tracked so that we know when a page is unpinned & we can't use the ptr anymore
+ - Bulk Insert
+ - fastest way to build a new B+ tree for an existing table is to first sort the keys & then build the index from bottom up
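+
+As referenced under "Intra Node Search" above, here is a small Python sketch contrasting the three search strategies on one node's sorted key array (interpolation assumes roughly uniformly distributed integer keys):
+```python
+import bisect
+
+def linear_search(keys, target):
+    for i, k in enumerate(keys):              # plain scan; SIMD-friendly in C/C++
+        if k == target:
+            return i
+    return -1
+
+def binary_search(keys, target):
+    i = bisect.bisect_left(keys, target)      # requires the keys to be sorted
+    return i if i < len(keys) and keys[i] == target else -1
+
+def interpolation_search(keys, target):
+    lo, hi = 0, len(keys) - 1
+    while lo <= hi and keys[lo] <= target <= keys[hi]:
+        if keys[hi] == keys[lo]:
+            break
+        # guess the position from the (assumed uniform) key distribution
+        pos = lo + (target - keys[lo]) * (hi - lo) // (keys[hi] - keys[lo])
+        if keys[pos] == target:
+            return pos
+        if keys[pos] < target:
+            lo = pos + 1
+        else:
+            hi = pos - 1
+    return lo if lo < len(keys) and keys[lo] == target else -1
+
+keys = [2, 4, 8, 15, 16, 23, 42, 99]
+assert linear_search(keys, 23) == binary_search(keys, 23) == interpolation_search(keys, 23) == 5
+```
+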
## 9 : Index Concurrency Control
- Single Threaded Engines: VoltDB (by Andy), Redis
@@ -552,7 +552,7 @@ Fun Fact: SQL was Sequel (Structured English Query Language) initially but IBM h
- **Physical Correctness** : is the internal representation of the object sound?
- pointers aren't invalid, data is correct etc.
- Focusing on Physical Correctness in this lecture.
-#### Locks vs. Latches
+### Locks vs. Latches
| _ | Locks | Latch |
|-------------|--------------------------------------|---------------------------|
| Separate | User Transactions | Threads |
@@ -593,37 +593,37 @@ Fun Fact: SQL was Sequel (Structured English Query Language) initially but IBM h
2. Slot Latches
- each slot has its own latch
-- B+ Tree Concurrency Control
- - Latch Crabbing / Coupling
- - Protocol to allow multiple threads to access / modify B+ Tree at the same time.
- - Get latch for parent
- - Get latch for child
- - Release latch for parent if "safe"
- - Safe node: won't split or merge when updated i.e.
- - not full (for insertion)
- - more than half-full (for deletion)
- - Releasing latches from top-to-bottom allows other threads to proceed with their request faster compared to doing to bottom-to-top.
- - Acquiring a write latch on root node everytime is a major bottleneck.
- - Optimization
- - assume that most modifications to a B+ Tree won't require a split or merge
- - take read latches from top to bottom & when you reach the leaf node then determine if your assumption is correct or not
- - if correct: take write latch on leaf node, perform operation
- - if not: release all latches, start search from start again but use the pessimistic algorithm (discussed above) this time
- - Sidenote: What about lock-free data structures?
- - Don't work that well under high contention.
- - Mentioned: Trying out an implementation of Microsoft's [B-W tree](https://15721.courses.cs.cmu.edu/spring2017/papers/08-oltpindexes2/bwtree-icde2013.pdf) against a B+ Tree. B+ Tree was faster. Also mentioned that skip-list (which is latch free) is also slower than a good B+ Tree.
- - B-W Tree has a separate lookup table with tree node ids & that extra lookup reduces performance.
- - What if threads want to move from one leaf node to another? (Leaf Node Scan)
- - There's potential for deadlocks now compared to when we were just moving top to bottom.
- - For a scan, once we've reached the leaf node level, we can acquire read latches in the same way on siblings (and release them) as we were doing in top->bottom latch crabbing.
- - Consider 2 parallel readers in opposite directions, if they intersect while scanning, it's not an issue because a node can have multiple read latches & once they move across each adjacent node will only have a single read latch.
- - Now consider one scan & another delete operation. The reader can't acquire a read latch on the leaf node with the key that's going to be deleted if the deleter acquired a write latch on it first.
- - We'd wait for very little time and then abort the read.
- - Note: If the reader acquired the read latch earlier on the "to-be-deleted" node then we won't have an issue. After the read, the deleter will delete the node.
- - Suggestion from students: Jitter in timeouts, Drop readers since they're less costly
- - Latches don't support deadlock detection or avoidance.
- - The leaf node sibling latch acquisition protocol must support a "no-wait" mode.
- - DBMS's data structures should cope with failed latch acquisitions.
+### B+ Tree Concurrency Control
+- Latch Crabbing / Coupling
+ - Protocol to allow multiple threads to access / modify B+ Tree at the same time.
+ - Get latch for parent
+ - Get latch for child
+ - Release latch for parent if "safe"
+ - Safe node: won't split or merge when updated i.e.
+ - not full (for insertion)
+ - more than half-full (for deletion)
+ - Releasing latches from top-to-bottom allows other threads to proceed with their requests sooner than releasing them bottom-to-top (a simplified sketch appears at the end of this section)
+- Acquiring a write latch on the root node every time is a major bottleneck.
+ - Optimization
+ - assume that most modifications to a B+ Tree won't require a split or merge
+ - take read latches from top to bottom & when you reach the leaf node then determine if your assumption is correct or not
+ - if correct: take write latch on leaf node, perform operation
+ - if not: release all latches and restart the search from the root, but use the pessimistic algorithm (discussed above) this time
+ - Sidenote: What about lock-free data structures?
+ - Don't work that well under high contention.
+ - Mentioned: Trying out an implementation of Microsoft's [B-W tree](https://15721.courses.cs.cmu.edu/spring2017/papers/08-oltpindexes2/bwtree-icde2013.pdf) against a B+ Tree. B+ Tree was faster. Also mentioned that skip-list (which is latch free) is also slower than a good B+ Tree.
+ - B-W Tree has a separate lookup table with tree node ids & that extra lookup reduces performance.
+- What if threads want to move from one leaf node to another? (Leaf Node Scan)
+ - There's potential for deadlocks now compared to when we were just moving top to bottom.
+ - For a scan, once we've reached the leaf node level, we can acquire read latches in the same way on siblings (and release them) as we were doing in top->bottom latch crabbing.
+ - Consider 2 parallel readers in opposite directions, if they intersect while scanning, it's not an issue because a node can have multiple read latches & once they move across each adjacent node will only have a single read latch.
+ - Now consider one scan & another delete operation. The reader can't acquire a read latch on the leaf node with the key that's going to be deleted if the deleter acquired a write latch on it first.
+ - We'd wait for very little time and then abort the read.
+ - Note: If the reader acquired the read latch earlier on the "to-be-deleted" node then we won't have an issue. After the read, the deleter will delete the node.
+ - Suggestion from students: Jitter in timeouts, Drop readers since they're less costly
+ - Latches don't support deadlock detection or avoidance.
+ - The leaf node sibling latch acquisition protocol must support a "no-wait" mode.
+ - DBMS's data structures should cope with failed latch acquisitions.
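+
+A heavily simplified Python sketch of the pessimistic crabbing protocol referenced above. It uses plain exclusive latches (`threading.Lock`) instead of reader/writer latches, and the node layout and helper names are made up for illustration:
+```python
+import threading
+
+class Node:
+    def __init__(self, keys, children=None, capacity=4):
+        self.keys = keys
+        self.children = children or []   # empty list => leaf node
+        self.capacity = capacity
+        self.latch = threading.Lock()    # stand-in for a reader/writer latch
+
+    def safe_for_insert(self):
+        # "safe" = adding one more key will not split this node
+        return len(self.keys) < self.capacity - 1
+
+def child_for(node, key):
+    # hypothetical helper: pick the child subtree that covers `key`
+    i = 0
+    while i < len(node.keys) and key >= node.keys[i]:
+        i += 1
+    return node.children[i]
+
+def descend_for_insert(root, key):
+    """Latch crabbing for an insert: returns the still-latched path suffix."""
+    root.latch.acquire()
+    held = [root]
+    node = root
+    while node.children:
+        child = child_for(node, key)
+        child.latch.acquire()            # latch the child before releasing ancestors
+        if child.safe_for_insert():
+            for n in held:               # child can't split, so the ancestors can go
+                n.latch.release()
+            held = []
+        held.append(child)
+        node = child
+    return held                          # caller inserts into held[-1], then releases all
+```
+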
## 10 : Sorting & Aggregations Algorithms
- Disk Manager -> Buffer Pool Manager -> Access Methods (scanning indexes, scanning the table itself) -> **Operator Execution** -> Query Planning
@@ -674,33 +674,33 @@ Fun Fact: SQL was Sequel (Structured English Query Language) initially but IBM h
- It doesn't effectively utilise additional buffer space (if available).
- Double Buffering Optimization : Prefetch the next run in the background & store it in a 2nd buffer while the system is processing the current run.
- Reduces wait-time for I/O requests at each step by continuously utilizing the disk.
- - General External Merge Sort
- - Pass #0
- - Use ***B*** buffer pages
- - Produce ⌈N / B⌉ sorted runs of size ***B***
- - Pass #1,2,3
- - Merge ***B-1*** runs
- - No. of passes = 1 + ⌈ logB-1⌈N / B⌉⌉
- - Total I/O Cost = 2N * (# of passes)
- - Note: We were ignoring the constant factor B in the 2-way external merge sort
- - Comparison Optimizations
- - Code Specialization: instead of providing a comparison function as a pointer to the sorting algorithm, create a hardcoded version of sort that is specific to a key type.
- - If everything fits in memory then following the ptr to the function is costly.
- - In some systems, you can build out specialized version of the sorting algorithm where comparison functions are hard-coded in the sorting algorithm itself.
- - Mentions Just-In-Time compilation
- - pg does this
- - Suffix Truncation: Compare a binary prefix of long VARCHAR keys instead of slower string comparison. Fallback to slower version if prefixes are equal.
- - Using B+ Trees for Sorting
- - If the table already has an index on the sort attribute(s), we can use it to accelerate sorting by retrieving tuples in desired sort order by traversing the leaf pages of the tree.
- - Cases
- - Clustered B+ Tree
- - Traverse to the left-most leaf page and then retrieve tuples from all leaf pages.
- - Unclustered B+ Tree
- - Chase each pointer to the page that contains the data.
- - Would require random I/O for each data record.
- - Note: From https://learn.microsoft.com/en-us/sql/relational-databases/indexes/clustered-and-nonclustered-indexes-described?view=sql-server-ver16
- - Clustered indexes sort and store the data rows in the table or view based on their key values. These key values are the columns included in the index definition. There can be only one clustered index per table, because the data rows themselves can be stored in only one order.
- - Nonclustered indexes have a structure separate from the data rows. A nonclustered index contains the nonclustered index key values and each key value entry has a pointer to the data row that contains the key value.
+- General External Merge Sort
+ - Pass #0
+ - Use ***B*** buffer pages
+ - Produce ⌈N / B⌉ sorted runs of size ***B***
+ - Pass #1,2,3
+ - Merge ***B-1*** runs
+ - No. of passes = 1 + ⌈ log_(B-1) ⌈N / B⌉ ⌉ (see the worked example at the end of this sorting section)
+ - Total I/O Cost = 2N * (# of passes)
+ - Note: We were ignoring the constant factor B in the 2-way external merge sort
+- Comparison Optimizations
+ - Code Specialization: instead of providing a comparison function as a pointer to the sorting algorithm, create a hardcoded version of sort that is specific to a key type.
+ - If everything fits in memory then following the ptr to the function is costly.
+ - In some systems, you can build out specialized version of the sorting algorithm where comparison functions are hard-coded in the sorting algorithm itself.
+ - Mentions Just-In-Time compilation
+ - pg does this
+ - Suffix Truncation: Compare a binary prefix of long VARCHAR keys instead of slower string comparison. Fallback to slower version if prefixes are equal.
+- Using B+ Trees for Sorting
+ - If the table already has an index on the sort attribute(s), we can use it to accelerate sorting by retrieving tuples in desired sort order by traversing the leaf pages of the tree.
+ - Cases
+ - Clustered B+ Tree
+ - Traverse to the left-most leaf page and then retrieve tuples from all leaf pages.
+ - Unclustered B+ Tree
+ - Chase each pointer to the page that contains the data.
+ - Would require random I/O for each data record.
+ - From [Clustered and nonclustered indexes](https://learn.microsoft.com/en-us/sql/relational-databases/indexes/clustered-and-nonclustered-indexes-described?view=sql-server-ver16)
+ - Clustered indexes sort and store the data rows in the table or view based on their key values. These key values are the columns included in the index definition. There can be only one clustered index per table, because the data rows themselves can be stored in only one order.
+ - Nonclustered indexes have a structure separate from the data rows. A nonclustered index contains the nonclustered index key values and each key value entry has a pointer to the data row that contains the key value.
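+
+Worked example for the external merge sort cost formulas above, with made-up numbers (N = 108 pages, B = 5 buffer pages):
+```python
+from math import ceil, log
+
+N = 108   # pages to sort (assumed)
+B = 5     # available buffer pages (assumed)
+
+runs_after_pass0 = ceil(N / B)                   # 22 sorted runs of B pages each
+passes = 1 + ceil(log(runs_after_pass0, B - 1))  # 1 + ceil(log_4(22)) = 1 + 3 = 4
+total_io = 2 * N * passes                        # read + write every page per pass
+print(runs_after_pass0, passes, total_io)        # 22 4 864
+```
+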
### Aggregations
- Collapsing values for a single attribute from multiple tuples into a single scalar value.
@@ -711,37 +711,37 @@ Fun Fact: SQL was Sequel (Structured English Query Language) initially but IBM h
- Goal is to avoid having random I/O going to disk & maximizing sequential access.
- If the SQL statement has a `ORDER BY` clause then it makes more sense to use sorting.
- If the data doesn't need to be sorted then hashing is usually better since it's computationally cheaper.
+
- Hashing Aggregate
- Populate an ephemeral hash table as the DBMS scans the table.
- If everything doesn't fit in memory then we'll need to perform an "External Hashing Aggregate"
- - Assumption: We only need a single pass (of both phases) for now.
- - Phase #1 - Partition
- - Divide tuples into buckets/partitions based on hash key.
- - A partition is one or more pages that contain the set of keys with same hash value.
- - Write them out to disk when they get full.
- - Partitions are spilled to disk via output buffers.
- - Let's say we have **B** buffers.
- - B-1 buffers for the partition
- - 1 buffer for the input data.
- - Possible Optimization: If we have a `DISTINCT` clause, we can avoid duplicate keys when building the initial partitions.
- - Phase #2 - ReHash
- - Build in-memory hash table for each partition & compute the aggregation.
- - For each partition on disk
- - Assumption: The hash table & each partition fits in memory.
- - Hash tables would be comparatively smaller than the entire dataset so they should ideally fit in memory. Even if they do spill-over, it can be dealt with easily.
- - Read it in memory & build an in-memory hash table based of a different hash function.
- - Go through each bucket of this hash table to bring together matching tuples.
- - During the Rehash phase, we want to store pairs in the form:
- - (GroupKey->RunningVal)
- - Thing of the `AVG` clause using COUNT & SUM
- - HashTable would have a key & then value as (count, sum) when could be further aggregated to produce the final result.
+ - Assumption: We only need a single pass (of both phases) for now.
+ - Phase #1 - Partition
+ - Divide tuples into buckets/partitions based on hash key.
+ - A partition is one or more pages that contain the set of keys with same hash value.
+ - Write them out to disk when they get full.
+ - Partitions are spilled to disk via output buffers.
+ - Let's say we have **B** buffers.
+ - B-1 buffers for the partition
+ - 1 buffer for the input data.
+ - Possible Optimization: If we have a `DISTINCT` clause, we can avoid duplicate keys when building the initial partitions.
+ - Phase #2 - ReHash
+ - Build in-memory hash table for each partition & compute the aggregation.
+ - For each partition on disk
+ - Assumption: The hash table & each partition fits in memory.
+ - Hash tables would be comparatively smaller than the entire dataset so they should ideally fit in memory. Even if they do spill-over, it can be dealt with easily.
+ - Read it in memory & build an in-memory hash table based on a different hash function.
+ - Go through each bucket of this hash table to bring together matching tuples.
+ - During the Rehash phase, we want to store pairs in the form:
+ - (GroupKey->RunningVal)
+ - Think of computing `AVG` using COUNT & SUM
+ - The hash table would map each key to a (count, sum) pair, which can then be combined to produce the final result (see the sketch below).
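+
+A toy Python sketch of the two phases for something like `SELECT key, AVG(val) ... GROUP BY key` (in-memory lists stand in for spilled partition files; the partition count and hash seeds are arbitrary):
+```python
+from collections import defaultdict
+
+def external_hash_avg(rows, num_partitions=4):
+    # Phase 1 - Partition: split rows by h1(key); a real system spills each
+    # partition to disk through an output buffer once it fills up.
+    partitions = defaultdict(list)
+    for key, val in rows:
+        partitions[hash(("h1", key)) % num_partitions].append((key, val))
+
+    # Phase 2 - ReHash: per partition, build an in-memory hash table (conceptually
+    # keyed by a second hash function h2) of (GroupKey -> RunningVal) pairs.
+    results = {}
+    for part in partitions.values():
+        table = {}
+        for key, val in part:
+            cnt, s = table.get(key, (0, 0))
+            table[key] = (cnt + 1, s + val)      # running (COUNT, SUM) for AVG
+        for key, (cnt, s) in table.items():
+            results[key] = s / cnt
+    return results
+
+print(external_hash_avg([("a", 2), ("b", 5), ("a", 4)]))  # averages: a -> 3.0, b -> 5.0
+```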
+
- Discussed Optimizations for Sorting
- Chunk I/O into large blocks to amortize costs
- Double-buffering to overlap CPU & I/O
- Mentions: in-memory DBMS that assume everything can fit in memory
- pg, MySQL are built from the ground-up to assume that things won't fit in memory
-
-
## 11 : Joins Algorithms
- Since tables in a relational DB are normalised to avoid unnecessary repetition of information, we need to perform joins (using the join operator) to re-construct the "original tuples" without any loss of information.
- Sidenote: For OLAP systems, a/c research 15-50% of the time for a query is spent in joins.
diff --git a/content/notes/CockroachDB's Query Optimizer (2020).md b/content/notes/CockroachDB's Query Optimizer (2020).md
new file mode 100644
index 0000000..03512bc
--- /dev/null
+++ b/content/notes/CockroachDB's Query Optimizer (2020).md
@@ -0,0 +1,320 @@
+---
+tags:
+ - db
+ - talks
+created: 2024-07-27
+updated: 2024-07-30
+source: https://www.youtube.com/watch?v=wHo-VtzTHx0
+origin: Rebecca Taft
+publish: true
+rating: 4
+---
+CockroachDB: Postgres-compatible Geo-Distributed SQL database
+
+Architecture
+- Shared-nothing
+- Consists of distributed SQL layer on top of a distributed KV layer
+
+This talk focuses on the SQL layer.
+
+## Query Optimization in CockroachDB
+- Why not use Postgres (or some other OSS) optimizer?
+ - CockroachDB codebase is written in Go. pg's optimizer is in C. Didn't want the overhead of calling C from Go.
+ - Execution plans are very different in CockroachDB. A plan that performs very well on a single node may perform poorly when distributed across nodes at a large distance.
+ - Optimizer is key to DB performance and using other optimizer won't let them maintain control.
+- CDB's First Optimizer
+ - Not really an optimizer. Used heuristics (rules) to choose execution plan.
+ - Eg: "if an index is available, always use it"
+ - Over time, rules started looking like this:
+ - "always use the index, except when the table is very small or we expect to scan >75% of the rows, or the index is located on a remote machine"
+ - This became difficult to manage.
+ - This kind of optimizer works for OLTP but customers were using CDB for OLAP queries too.
+- Cost-based optimizer
+ - Instead of applying rigid rules, considers multiple alternatives
+ - Assign a cost to each alternative and choose lowest cost option
+ - Cascades-style optimization w/ unified search
+ - How to generate alternatives?
+ - Start w/ default plan from SQL query
+ - Perform a series of transformations (the normalization & exploration rules described below)
+ - Store alternatives in a compact data structure called **memo**
+ - https://www.querifylabs.com/blog/memoization-in-cost-based-optimizers
+ - https://sqlserverperformace.blogspot.com/2020/03/inside-sql-server-query-optimizer-part.html
+
+Q/A
+- Do you do anything like what Postgres does where they've the initial cost that is a cheap approximation and then there's a final cost if you decide to complete the rest of it OR is it a single cost model that produces a single value?
+ - Just a single value right now.
+- Can you do cost model estimations on logical nodes or does it always have to be a physical node?
+ - We've merged the idea of logical and physical nodes. For eg: we don't have the concept of a logical join. A logical join is just a hash join for us.
+## Generating alternative plans
+- Phase of plan generation
+ - *Parse -> Optbuild -> Normalize -> Explore -> DistSQL planning*
+- Sample Query
+```sql
+CREATE TABLE ab (a INT PRIMARY KEY, b INT, INDEX(b));
+CREATE TABLE cd (c INT PRIMARY KEY, d INT);
+SELECT * FROM ab JOIN cd ON b=c WHERE b>1;
+```
+
+### Parsing
+- Parse the SQL query. Uses a yacc file similar to pg.
+![[Pasted image 20240727182314.png]]
+
+### Optbuild
+- Takes AST from parser & produces the preliminary query plan.
+```
+ConstructSelect(
+ ConstructInnerJoin(
+ ConstructScan(),
+ ConstructScan(),
+ ConstructFiltersItem(
+ ConstructEq(
+ ConstructVariable(),
+ ConstructVariable(),
+ ),
+ ),
+ ),
+ ConstructFiltersItem(
+ ConstructGt(
+ ConstructVariable(),
+ ConstructConst(),
+ ),
+ ),
+)
+```
+
+- Also does semantic analysis. For eg:
+ - Do the tables in the query actually exist and does the current user have permission to read them?
+ - Do the columns exist in those specific tables and are they unique?
+ - What columns are selected by `*`?
+ - Do types match for equality comparison?
+- Q/A:
+ - At what point do you try to bind a prepared statement value to a type?
+ - Happens during the optbuild phase.
+
+### Normalization
+- Happens in parallel w/ the optbuild phase. The nested function calls are factory methods generated from a no. of defined normalization rules.
+- Each of the factory functions has a bunch of normalization rules that execute, modify the output, and build up the fully normalized plan.
+![[Pasted image 20240727183013.png]]
+- In the image, we've used the fact that `b=c` to infer that if `b>1` then `c>1` and pushed the filter down below the join.
+- Normalization rules
+ - Create a logically equivalent relation expression.
+ - Normalization (or "rewrite") rules are almost always good to apply.
+ - Eg:
+ - Eliminate unnecessary operations: `NOT (NOT x) -> x`
+ - Canonicalize expressions: `5 = x -> x = 5`
+ - Constant folding: `length('abc') -> 3`
+ - Predicate push-down
+ - De-correlation of subqueries
+
+#### DSL : Optgen
+- DSL for representing normalization and exploration rules.
+- Gets compiled to factory function in Go which are called in Optbuild.
+- Examples
+
+1.
+```
+# EliminateNot discards a doubled Not operator // Comment explaining the rule
+
+[EliminateNot, Normalize] // Header with rulename, tag
+(Not (Not $input:*)) // Match rule
+=>
+$input // Replace expression
+```
+
+```go
+// ConstructNot constructs an expression for the Not operator.
+func (_f *Factory) ConstructNot(input opt.ScalarExpr) opt.ScalarExpr {
+ // [EliminateNot]
+ {
+ _not, _ := input.(*memo.NotExpr)
+ if _not != nil {
+ input := _not.Input
+ if _f.matchedRule == nil || _f.matchedRule(opt.EliminateNot) {
+ _expr := input
+ return _expr
+ }
+ }
+ }
+ // ... other rules ...
+ e := _f.mem.MemoizeNot(input)
+ return _f.onConstructScalar(e)
+}
+```
+
+2.
+```
+# MergeSelects combines two nested Select operators into a single Select that
+# ANDs the filter conditions of the two Selects.
+
+[MergeSelects, Normalize]
+(Select (Select $input:* $innerFilters:*) $filters:*)
+=>
+(Select $input (ConcatFilters $innerFilters $filters))
+```
+
+```go
+// [MergeSelects]
+{
+ _select, _ := input.(*memo.SelectExpr)
+ if _select != nil {
+ input := _select.Input
+ innerFilters := _select.Filters
+ if _f.matchedRule == nil || _f.matchedRule(opt.MergeSelects) {
+ _expr := _f.ConstructSelect(
+ input,
+ // DSL allows calling arbitrary Go functions (like ConcatFilters)
+ // defined by them
+ _f.funcs.ConcatFilters(innerFilters, filters),
+ )
+ return _expr
+ }
+ }
+}
+```
+
+Q/A
+- How Go specific is the DSL?
+ - We don't use any Go specific PL features. Can be rewritten for other languages.
+
+### Exploration
+- Exploration rules may or may not produce a better plan so both alternatives are kept around unlike normalization where replacement is done.
+- Same syntax as Normalize rules in the DSL with a different tag (Explore)
+- Eg:
+ - Join reordering: A join (B join C) -> (A join B) join C
+ - Join algorithm (eg. hash join, merge join, lookup join)
+ - Index selection
+
+
+Memo after normalization
+- Memo stores query plan trees. It consists of a series of groups.
+- They also store scalar expressions in the Memo groups but only relational expressions are shown in the image below.
+- Groups can refer to other groups. Eg. Group 1 does an inner join b/w Group 2 and 3.
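+
+A rough sketch of what a memo's group structure could look like (the type and operator names here are illustrative, not CockroachDB's actual ones):
+```python
+from dataclasses import dataclass, field
+
+@dataclass
+class MemoExpr:
+    op: str                                        # e.g. "InnerJoin", "Scan ab@primary"
+    children: list = field(default_factory=list)   # child *group* ids, not expressions
+
+@dataclass
+class MemoGroup:
+    exprs: list                                    # logically equivalent alternatives
+
+# Group 1 joins groups 2 and 3; groups 2 and 3 hold alternative scans.
+memo = {
+    1: MemoGroup([MemoExpr("InnerJoin", [2, 3]), MemoExpr("LookupJoin", [2, 3])]),
+    2: MemoGroup([MemoExpr("Scan ab@primary"), MemoExpr("Scan ab@b_idx")]),
+    3: MemoGroup([MemoExpr("Scan cd@primary")]),
+}
+```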
+
+![[Pasted image 20240728181144.png]]
+
+For exploration, you iterate through the groups and see if any exploration rules match.
+
+GenerateIndexScans created an alternate scan in which we're scanning the secondary index as opposed to the primary index.
+![[Pasted image 20240728181353.png]]
+
+![[Pasted image 20240728181717.png]]
+
+Example from https://www.querifylabs.com/blog/memoization-in-cost-based-optimizers
+![[Pasted image 20240728182123.png]]
+
+The best plan gets passed to the next phase after cost determination.
+
+### DistSQL Planning
+Takes the plan from the optimizer and extends them to whatever the cluster topology is.
+
+![[Pasted image 20240728182851.png]]
+
+> Currently doesn't take advantage of how the tables are laid out on the disk, optimizations that may involve broadcasting some data from a smaller table to another for joins etc. but planned in near future.
+
+## Choosing a Plan
+- Factors that affect cost:
+ - Hardware configuration
+ - Data distribution
+ - Type of operators
+ - Performed benchmarking to understand the relative cost of different operators under the assumption that they won't change much w/ different queries.
+ - This relative cost is hard-coded in the cost model.
+ - No. of rows processed by each operator
+
+**No. of rows processed by each operator**
+- Collect statistics: Row count, Distinct count, Null count, Histogram
+- Multi-column stats are also collected in addition to single column stats
+ - Use indexes to determine what columns to collect multi-column stats on
+ - Eg: for index on (a, b, c), collect multi-column stats on (a,b) and (a,b,c)
+- Stats Collection (`CREATE STATISTICS`): Full Table Scan -> Perform Sampling (size: 10K rows) ; Insert each row into HyperLogLog sketch to calculate distinct count for each column -> Aggregate Samples
+- `CREATE STATISTICS` is automatically run when:
+ - A table is created
+ - A new column or index is added
+ - ~20% of data in table has changed
+- How to determine when 20% of data has changed?
+ - After a mutation on some node, statistic collection will be triggered based on chance
+ - `P(refresh) = no. of rows updated / (no. of rows in table * 0.20)`
+- Always refresh if there are no stats yet or it's been a while since the last refresh (see the sketch below)
+- Each create stats run takes minutes. Full table scan can impact performance. Many table scans at once can bring down the cluster.
+- To minimize performance impact
+ - Run `CREATE STATISTICS` as a job
+ - Guarantees only 1 stats job running at a time
+ - Resilient to node failures
+ - Use throttling to limit CPU utilization by stats job
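+
+A tiny sketch of that probabilistic refresh decision (the function name is made up; the 20% target just restates the formula above):
+```python
+import random
+
+def maybe_refresh_stats(rows_updated, rows_in_table, target_fraction=0.20):
+    # P(refresh) grows linearly and reaches 1 once ~20% of the table has changed
+    p = min(1.0, rows_updated / (rows_in_table * target_fraction))
+    return random.random() < p
+```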
+
+## Locality-Aware SQL Optimization
+- Customers can optionally duplicate read-mostly data in each locality
+ - Use replication constraints to pin the copies to different geographic regions (eg. US-East, US-West, EU)
+ - i.e. duplicate index keys in the table similar to the primary key
+ - Optimizer includes locality in cost model & automatically selects index from same locality: `primary`, `idx_eu` or `idx_usw`
+```sql
+CREATE TABLE postal_code (
+ id INT PRIMARY KEY,
+ code STRING,
+ INDEX idx_eu (id) STORING (code),
+  INDEX idx_usw (id) STORING (code)
+);
+```
+- Plan queries to use data from the same locality
+- What's next? (see attached slides for more info on this)
+ - Replace duplicated indexes w/ "global tables"
+ - https://www.cockroachlabs.com/blog/global-tables-in-cockroachdb/
+ - Support geo-partitioned unique indexes
+ - Move DistSQL planning into optimizer
+ - Incorporate latency into cost model
+
+## Theory vs. Practice
+
+**Optimizing for OLTP**
+- When they went from the simple heuristic planner to the cost based optimizer, they had to focus a lot on minimizing overhead for simple OLTP queries (eg. primary key lookups) because even though the heuristic planner had problems, it was really fast
+ - Took advantage of logical properties essential for optimization
+ - Cardinality (different from stats)
+ - Functional dependencies
+ - Non-null columns etc.
+- Normalization rules are really important. As of this talk they have 242 normalization rules & 29 exploration rules
+- Foreign key checks & cascades optimized as "post queries"
+ - Foreign key checks (eg: value inserted in a child table that references some other parent table and needs to check whether the value exists in parent table or not) use joins which can be optimized.
+ - Done after the statement is executed (but hasn't returned value yet).
+
+**Join Ordering**
+- v1 was shipped w/o join ordering.
+- Initially implemented w/ 2 rules: CommuteJoin, AssociateJoin
+ - Was really inefficient.
+ - Reordered at most 4 tables by default.
+- An intern implemented DPSUBE from "Guido Moerkotte, Pit Fender, and Marius Eich. 2013. On the correct and complete enumeration of the core search space." and made it more efficient.
+ - Now it orders up to 8 tables by default.
+
+**Query Cache**
+- LRU cache keyed on SQL string
+- Stores optimized memo
+- For prepared statements w. placeholders
+ - Normalized memo is stored
+ - Placeholders are replaced during execution followed by additional normalization and exploration
+
+Other features
+- Optimizer Hints
+ - CockroachDB allows forcing specific index and join types as hints
+- Debugging tools `EXPLAIN ANALYZE (DEBUG) ...`
+ - Generates a bundle with stats, schema, environment variables, and the query plan at various verbosities
+
+## Q/A
+- How do you test the optimizer? How do you test if the cost model is working correctly?
+ - We don't prove that the chosen plan is the best plan. We mostly focus on testing the correctness of the plan.
+ - We run benchmarking regularly so that existing queries don't regress.
+- Are input parameters enough to debug everything you need?
+ - It's enough for now. Once we start adding more to the cost model to make it more aware of the cluster topology and data distribution then we'd need more info.
+- Do you run any kind of SQL fuzzer?
+ - Yes. SQLSmith. Mostly for checking if there's any internal error.
+ - Manuel Rigger was testing logical correctness using SQLLancer and opened a bunch of issues.
+ - https://www.manuelrigger.at/dbms-bugs/
+- In general, what's the complexity of queries you're seeing? Cockroach isn't like Snowflake so complex queries might be limited since they're pushed to those systems. Are things limited to TPC-H, TPC-DS?
+ - Complex queries are uncommon.
+- Is there any restriction on rewrite rules on the kind of complexity they're allowed to handle? You mentioned that they can call arbitrary (programmed by them) Go functions.
+ - We don't allow User-Defined functions. All functions are reviewed by some team member.
+
+
+
+## Appendix
+- Slides: https://courses.cs.washington.edu/courses/csep590d/22sp/lectures/UW%20Talk%202022%20CockroachDB's%20Optimizer.pdf (from another event)
+- https://github.com/cockroachdb/cockroach
+- [SIGMOD'20 - CockroachDB: The Resilient Geo-Distributed SQL Database](https://dl.acm.org/doi/pdf/10.1145/3318464.3386134)
+- [How we built a cost-based SQL optimizer](https://www.cockroachlabs.com/blog/building-cost-based-sql-optimizer/)
diff --git a/content/notes/DuckDB - Lambda functions in the duck's nest (2024).md b/content/notes/DuckDB - Lambda functions in the duck's nest (2024).md
new file mode 100644
index 0000000..633397f
--- /dev/null
+++ b/content/notes/DuckDB - Lambda functions in the duck's nest (2024).md
@@ -0,0 +1,117 @@
+---
+tags:
+ - db
+ - talks
+created: 2024-05-23
+source: https://www.youtube.com/watch?v=tcFsgFXV0sM
+origin: Tania Bogatsch
+publish: true
+rating: 3
+---
+
+- Tania is a software engineer at DuckDB Labs. She previously interned at CWI.
+- DuckDB in a nutshell: Analytical database, In-process, Written in C++
+
+## Nested Data
+- Relevance of nested data & lambdas
+ - Nested inputs like JSON and Lists are common
+ - Aggregations aren't intuitive on nested structures
+ - Un-nesting data
+- Recent example: https://til.simonwillison.net/duckdb/remote-parquet
+```sql
+-- reading data from multiple parquet files stored elsewhere using DuckDB
+SELECT SUM(size_total)
+FROM (
+ SELECT SUM(size) as size_total FROM 'https://huggingface.co/datasets/vivym/midjourney-messages/resolve/main/data/000000.parquet'
+ UNION ALL
+ SELECT SUM(size) as size_total FROM 'https://huggingface.co/datasets/vivym/midjourney-messages/resolve/main/data/000001.parquet'
+ UNION ALL
+ SELECT SUM(size) as size_total FROM 'https://huggingface.co/datasets/vivym/midjourney-messages/resolve/main/data/000002.parquet'
+ UNION ALL
+ SELECT SUM(size) as size_total FROM 'https://huggingface.co/datasets/vivym/midjourney-messages/resolve/main/data/000003.parquet'
+ UNION ALL
+ SELECT SUM(size) as size_total FROM 'https://huggingface.co/datasets/vivym/midjourney-messages/resolve/main/data/000004.parquet'
+ -- and so on until 000055.parquet
+);
+```
+
+```sql
+-- query using lambda function in DuckDB
+SELECT
+ SUM(size) AS size
+FROM read_parquet(
+ list_transform(
+ generate_series(0, 55),
+ n -> 'https://huggingface.co/datasets/vivym/midjourney-messages/resolve/main/data/' ||
+ format('{:06d}', n) || '.parquet'
+ )
+);
+```
+
+- Nested data isn't typical for relational DBs
+ - Schemas don't support nesting and force users to normalize their data
+ - You'd end up needing to buy/build a specialized analytical system that supports nested data
+ - Note: pg has nested data support but isn't an analytical system.
+
+## How DuckDB Does It
+
+### Execution
+![[Pasted image 20240523233204.png]]
+
+The internal layout for the list type is split into a vector of (offset, length) entries for each list, plus a selection vector and a child vector holding the underlying data.
+Benefits
+- Vector-at-a-time (operation applied to entire vector leveraging SIMD)
+ - Ideally the vectors fit in L1 cache.
+- Can use tight loops to run computations.
+
+> Sidenote:
+> A tight loop is one which is CPU cache-friendly. It is a loop which fits in the instruction cache, which does no branching, and which effectively hides memory fetch latency for data being processed.
+> Source: https://stackoverflow.com/a/26924484/12531621
+
+
+```sql
+-- tbl:
+┌────────────┬───┐
+│ l ┆ n │
+╞════════════╪═══╡
+│ [7, 12, 5] ┆ 2 │
+│ [1, 11] ┆ 3 │
+└────────────┴───┘
+
+-- SELECT [x + n FOR x IN l IF x < 10] AS result FROM tbl;
+┌────────┐
+│ result │
+╞════════╡
+│ [9, 7] │
+│ [4] │
+└────────┘
+```
+
+- The first list now has length 2 since one element (12) was filtered out. Similarly, one element was filtered out of the 2nd list, leaving a single element.
+- The selection vector has also changed: the first element (7) is at index 0, the second element (5) is at index 2, and so on.
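+
+Row by row, the list comprehension behaves like the plain Python below; DuckDB just executes the same thing vectorized over the offset/length, selection, and child vectors instead of per row:
+```python
+tbl = [([7, 12, 5], 2), ([1, 11], 3)]
+
+# SELECT [x + n FOR x IN l IF x < 10] AS result FROM tbl;
+result = [[x + n for x in l if x < 10] for (l, n) in tbl]
+print(result)  # [[9, 7], [4]]
+```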
+
+![[Pasted image 20240524001339.png]]
+
+
+### Parser, Transformer, Binder
+- Parser / Transformer
+ - Parse list comprehension syntax
+ - Transform into scalar function
+ - `SELECT [x + n FOR x IN l IF x < 10] AS result FROM tbl;` ->
+ `SELECT LIST_TRANSFORM(LIST_FILTER(l, y -> y < 10), x -> x + n) AS result FROM tbl;`
+ - Binder
+ - Lambda parameters (`x ->`)
+ - Get the logical type of each parameter
+ - Create a dummy table containing 1 column for each parameter
+ - Lambda expression (`-> x + n`)
+ - Bind the lambda expression like any other expression
+ - Use the dummy table to "pretend" that lambda parameters are columns
+ - Index calculation
+ - During execution : expects vectors to execute on
+ - Indexes express vector location. In `-> x + n` x has index: 0, n has index 1.
+ - Expressions can get more complex with recursion and nested lambda parameters.
+ - Eg: `log(n + x * 2) + y` where y is an outer lambda parameter.
+ - Captures
+ - The lambda expression is removed from the scalar function `LIST_FILTER` and added to a global execution state.
+ - The function parameters are updated. Eg: `LIST_FILTER(l, y -> y < 10)` -> `LIST_FILTER(l, 10)`
+
\ No newline at end of file
diff --git a/content/notes/DuckDB Internals (2023).md b/content/notes/DuckDB Internals (2023).md
index e701106..3bfe075 100644
--- a/content/notes/DuckDB Internals (2023).md
+++ b/content/notes/DuckDB Internals (2023).md
@@ -7,6 +7,7 @@ updated: 2024-03-03
source: https://www.youtube.com/watch?v=bZOvAKGkzpQ
origin: Mark Raasveldt
publish: true
+rating: 4
---
> **Remarks**
> The Unified Vector Format was really neat. It's nice to see the adaptive string format from Umbra being used by DuckDB too.
diff --git a/content/notes/Efficient CSV Parsing - On the Complexity of Simple Things (2024).md b/content/notes/Efficient CSV Parsing - On the Complexity of Simple Things (2024).md
new file mode 100644
index 0000000..4ce36f6
--- /dev/null
+++ b/content/notes/Efficient CSV Parsing - On the Complexity of Simple Things (2024).md
@@ -0,0 +1,149 @@
+---
+tags:
+ - talks
+created: 2024-06-04
+source: https://www.youtube.com/watch?v=YrqSp8m7fmk
+origin: Pedro Holanda
+publish: true
+rating: 2
+---
+- Pedro did his PhD and post-doc at CWI. He works at DuckDB Labs now. He's worked on ART indexes, zonemaps, index joins, the CSV reader etc. for DuckDB.
+
+## Early On (2018)
+- The CSV parser was able to read simple well constructed CSV files.
+- Accept main dialect options: Delimiter, Quote, Escape
+- Accept schema definitions: Column Names / Types
+- Issues
+ - Bad Performance in Line Split and Sanitization
+ - Relied on `std::ifstream`
+ - No control on buffer management or file access.
+ - Didn't work for quoted new lines
+
+
+## Buffered CSV Reader + GOTO (2019)
+- Relevant file in PR: https://github.com/duckdb/duckdb/pull/211/files#diff-ab757987c109f9f46a7cd0876d1971730997734da4abe4ba27402738f457771c
+```cpp
+// Pseudo-Code
+
+normal:
+ /* state: normal parsing state */
+ // this state parses the remainder of a non-quoted value
+ // until we reach a delimiter or newline
+ do {
+ for (; position < buffer_size; position++) {
+ if (buffer[position] == options.delimiter[0]) {
+ // delimiter: end the value and add it to the chunk
+ goto add_value;
+ } else if (StringUtil::CharacterIsNewline(buffer[position])) {
+ // newline: add row
+ goto add_row;
+ }
+ }
+ } while (ReadBuffer(start, line_start));
+ // file ends during normal scan: go to end state
+ goto final_state;
+
+add_value:
+ ...
+add_row:
+ ...
+in_quotes:
+ ...
+unquote:
+ ...
+handle_escape:
+ ...
+carriage_return:
+ ...
+final_state:
+ ...
+```
+- No dependency on the C library
+ - As per the code, they're using the `algorithm` & `fstream` library. Doesn't that count?
+- Instead of `ifstream`, you have your own file handle which utilises 32MB buffers and provides control over what's in memory for efficiency.
+- ~ 4x improvement TPCH-SF10 benchmark with 60M line-items. Went from ~220s to ~55s.
+- Issues
+ - Problematic to debug (no stack information)
+ - Especially due to the use of the goto statement
+ - Brittle (can't handle CSV files that aren't well constructed)
+ - Setting headers, types and dialect (delimiter, quotes, escapes) is difficult
+
+## Woodstock Era (2020)
+- Relevant article: https://duckdb.org/2023/10/27/csv-sniffer.html
+- Focus on automatically reading CSV files.
+ - Have types, names and dialects inferred from data
+- Introduction of options like:
+ - ignore_errors: skip lines that don't conform to the detected rules
+ - null_padding: mark columns as null in rows w/ missing columns
+ - skip_rows: skip dirty rows (comments etc.) (auto-detected)
+- Things seen in CSV files
+ - Dirty Lines: comments on top or interspersed in the line
+ - Missing Columns
+ - Too Many Columns (extra ; but no data)
+ - Missing Quotes (eg: In line - `The Who; 'Who"s Next; 1971`)
+ - Non-UTF8 characters
+ - Empty Lines
+ - Extra ; at the end of every line (with no data)
+- Only a 10% speed-up on TPCH-SF10 compared to previous version.
+
+## The Parallel State Machine Era (2023-now)
+![[Pasted image 20240605021720.png]]
+
+In a (non well-structured) CSV file, you don't know where a line starts or ends. Eg:
+CSV File:
+```
+1;Tenacious D; 'The Pick Of Destiny'; 2006
+2;'Simon \r\n Garfunkel'; 'Sounds of Silence'; 1966
+3;Sir. Jon Lajoie; 'You want some of this?'; 2009
+4;The Who; 'Who's Next'; 1971
+```
+
+We've information about the schema of the CSV file from the CSV sniffer.
+```
+Bigint | Varchar | Varchar | Date
+```
+
+Buffers:
+- Buffer #1: `1;Tenacious D; 'The Pick Of Destiny'; 2006\r\n2;'Simon\r\n Garfunkel`
+- Buffer #2: `'; 'Sounds of Silence', 1966 \r\n 3;Sir. Jon Lajoie; 'You want some of`
+- Buffer #3: `this?'; 2009\r\n 4;The Who; 'Who's Next'; 1971`
+
+Let's say 4 threads (Thread 0 - Thread 3) are allotted parts to read by the global scanner (split shown using ` |` ).
+```
+1;Tenacious D; 'Th | e Pick Of Destiny | '; 2006\r\n2;'S | imon\r\n Garfunkel
+```
+
+Threads can cross over boundaries (in and across buffers) if needed. They also use the schema to determine if the particular line they read is valid or not.
+
+### State Machine
+The parser uses a state machine similar to this: (States x Transition Trigger)
+
+| _States_ | A..Z, 0..9 | , | " | \ | \n |
+| -------------------- | ---------- | --------- | -------- | -------- | ---------------- |
+| **Standard** | Standard | Delimiter | Standard | Standard | Record Separator |
+| **Delimiter** | Standard | Delimiter | Quote | Standard | Record Separator |
+| **Quote** | Quote | Quote | Unquote | Escape | Quote |
+| **Record Separator** | Standard | Delimiter | Standard | Standard | Record Separator |
+| **Escape** | INVALID | INVALID | Quote | INVALID | INVALID |
+| **Unquote** | INVALID | Delimiter | INVALID | INVALID | Record Separator |
+ Eg: If you're currently in Standard state and come across ",", you'll transition to Delimiter state.
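+
+A toy Python driver for a transition table like the one above (states and character classes trimmed down, escape handling omitted; this is not DuckDB's actual implementation):
+```python
+def char_class(c):
+    return {",": "delim", '"': "quote", "\n": "newline"}.get(c, "other")
+
+# (state, character class) -> next state
+TRANSITIONS = {
+    ("standard", "other"): "standard",   ("standard", "quote"): "standard",
+    ("standard", "delim"): "delimiter",  ("standard", "newline"): "record_sep",
+    ("delimiter", "other"): "standard",  ("delimiter", "quote"): "quote",
+    ("delimiter", "delim"): "delimiter", ("delimiter", "newline"): "record_sep",
+    ("quote", "other"): "quote",         ("quote", "delim"): "quote",
+    ("quote", "newline"): "quote",       ("quote", "quote"): "unquote",
+    ("unquote", "delim"): "delimiter",   ("unquote", "newline"): "record_sep",
+    ("record_sep", "other"): "standard", ("record_sep", "delim"): "delimiter",
+    ("record_sep", "quote"): "quote",
+}
+
+def count_rows(buffer):
+    state, rows = "record_sep", 0
+    for c in buffer:
+        state = TRANSITIONS.get((state, char_class(c)), "invalid")
+        if state == "invalid":
+            raise ValueError("malformed CSV")
+        if state == "record_sep":
+            rows += 1
+    return rows
+
+print(count_rows('a,"x\ny",1\nb,z,2\n'))  # 2 -- the quoted newline does not end a row
+```
+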
+
+- Improved ease of debugging.
+- Performance: takes 0.1s on the TPCH-SF10 benchmark
+- Some other optimizations
+ - Implicit Casting (casting during read instead of in a separate step when possible)
+ - Projection Pushdown
+
+## Future
+- Detecting Structural Errors (can be ignored or shown depending upon configuration)
+ - Missing Columns
+ - Extra Columns
+ - Invalid Unicode (`"pedro��"` -> `"pedro"`)
+ - Sidenote: Can also be configured to store rejected values (with file metadata, config and the error with exact row-col position)
+- Async IO
+- CSV Fuzzing / Testing
+- Faster Dialect Detection
+- Caching / Partitioning
+ - Multiple readers over the same file start from the same point.
+ - IO and parsing must happen again. This can be avoided.
+ - Academic work: [NoDB: Efficient Query Execution on Raw Data Files](https://stratos.seas.harvard.edu/files/stratos/files/nodb-cacm.pdf)
\ No newline at end of file
diff --git a/content/notes/Implementing InfluxDB IOx (2023).md b/content/notes/Implementing InfluxDB IOx (2023).md
new file mode 100644
index 0000000..602c64f
--- /dev/null
+++ b/content/notes/Implementing InfluxDB IOx (2023).md
@@ -0,0 +1,104 @@
+---
+tags:
+ - db
+ - talks
+created: 2024-05-24
+source: https://www.youtube.com/watch?v=Y5K2Ik2oo-8
+origin: Andrew Lamb
+updated: 2024-05-25
+rating: 3
+---
+- Andrew Lamb is currently a Staff Engineer at Influx Data & a PMC at Apache Datafusion. He's previously worked at Oracle on their DB server and Vertica on their optimizer.
+- InfluxDB IOx is a new time series database built using Apache Arrow, Parquet, Datafusion and Arrow Flight.
+
+## Why are specialized TSDB needed?
+- Obvious: Specialized for storing data w/ times
+- Schema on Write: new columns can appear or disappear at any time (but the types and column names are still the same), structure of data being streamed doesn't need to be specified upfront
+- High volume, denormalized ingest: a lot of fields (like hostname) might be repeated many times; it's important for a TSDB to remove that redundancy and make data queryable quickly
+- Rapid data value decay: recent data is very important and importance falls off drastically for old data
+
+Some TSDBs: InfluxDB, Facebook Gorilla, Google Monarch, Timescale, AWS Timestream, Graphite Whisper / Grafana.
+
+## TSDB Architecture
+
+### Classic
+- Log Structured Merge (LSM) tree + Series Index
+ - Primary Key: tags + timestamps -> compressed time series
+- Custom query languages: InfluxQL, Flux, PromQL, LogQL etc.
+ - Querying time series data in SQL is difficult
+- Custom file formats
+- A lot of them are written in Go
+
+### Next
+- Fast / Feature Rich
+ - Robust OLAP query engine implemented in native language (C/C++, Rust)
+ - No series cardinality limits, "infinite" retention, scale out
+- Cloud Native
+ - Tiered Storage: historical data on cheap object store, hot local disk/memory
+ - Disaggregated Compute: split compute/storage to run in K8s
+ - Scalable: Multiple services scale up/down - ingest, query, compaction
+- Ecosystem Compatibility
+ - File Format: open, widely supported
+ - Query Language: support for SQL along with domain specific QL
+ - Client compatibility like JDBC
+
+ > Building database systems is expensive. Most companies need to raise 100s of M$ to hire engineers for building these systems or be attached to a research institute for access to PhD students. Paul Dix (the CTO) decided to build on top of open source components. See [# Apache Arrow, Parquet, Flight and Their Ecosystem are a Game Changer for OLAP](https://www.influxdata.com/blog/apache-arrow-parquet-flight-and-their-ecosystem-are-a-game-changer-for-olap/)
+
+
+## Toolkit for a modern analytic system (OLAP DB)
+
+- File format (persistence) - Parquet
+- Columnar memory representation -> Arrow Arrays
+- Operations (eg. multiply, avg) -> Compute Kernels (provided by Arrow)
+- SQL + extensible query engine -> Arrow Datafusion
+- Network transfer -> Arrow Flight IPC
+- JDBC/ODBC driver, Wire Protocol -> Arrow Flight SQL
+
+### Apache Parquet
+- Columnar file format from 2013 (originally part of Hadoop ecosystem)
+- Defacto interchange format for analytics
+- Good compression for a wide variety of data
+- Large ecosystem of tools and systems
+- Support for many query acceleration "tricks" (projection and filter pushdown)
+ - See [Querying Parquet with Millisecond Latency](https://arrow.apache.org/blog/2022/12/26/querying-parquet-with-millisecond-latency/)
+ - For eg: RowGroups, Data Pages that are filtered out by a query can be skipped using aggregate metadata.
+- Parquet can store tabular or structured data (like JSON).
+ - See [Arrow and Parquet Part 2: Nested and Hierarchical Data using Structs and Lists](https://arrow.apache.org/blog/2022/10/08/arrow-parquet-encoding-part-2/)
+ - Structured data storage uses Dremel inspired [Record Shedding](https://www.joekearney.co.uk/posts/understanding-record-shredding)
+- File Layout
+ - A parquet file consists of Row Groups which contain Column Chunks.
+ - A Row Group typically holds on the order of 100K-1M rows and is kept at a roughly fixed size.
+ - Column Chunks are further made up of data pages.
+ - Metadata Layout
+ - Each parquet file has a footer storing metadata that also stores aggregates like row counts, sizes, min/max apart from the file metadata, location of data page and schema.
+
+ How does IOx use Parquet?
+ - Write Path: Incoming line protocol -> Ingester (Periodically writes data buffer as sorted parquet files to object store) -> Object Store
+ - Read Path: User query -> Querier? -> Reads parquet data from object store and answers queries
+
+### Apache Arrow
+- Nothing novel, just standardized
+- Compute kernels implement a lot of standard operations in a TSDB: https://arrow.apache.org/rust/arrow/compute/kernels/index.html
+
+How does IOx use Arrow
+- Indirectly via DataFusion and Flight
+- Directly via ingest path that parses input line protocol to arrow memory buffers
+ - Line protocol is parser and appended to an in memory buffer (mutable batch) that conforms to the Arrow format. It then gets snapshotted into Record Batches.
+ - Sorted data is periodically written on object store using Datafusion plan using
+- Querier also uses the Record Batches to get recent data that isn't persisted (to the object store) yet
+
+### Datafusion
+- Deconstructable "Query Engine" written in Rust using Arrow as its in-memory format
+- SQL and Dataframe API, Query Optimizer, Execution Engine, Support for data sources like CSV, Parquet, JSON, Avro etc.
+- Can be used directly as an embedded SQL engine or customised for a new system
+- SQL Query or DataFrame + Data Batches (Arrow Record Batches) -> Datafusion -> Data Batches
+- System Architecture
+ - Data Sources (Parquet, CSV etc.) + SQL/Dataframe -> Logical Plans -> Execution Plan -> Arrow Based Optimized Execution Operations (eg. ExpressionEval, HashAggregate, Join, Sort etc.)
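+
+A minimal sketch (mine) of using DataFusion as an embedded SQL engine over a Parquet file; the table name, column names and file path are invented, and it assumes the `datafusion` and `tokio` crates:
+
+```rust
+use datafusion::prelude::*;
+
+#[tokio::main]
+async fn main() -> datafusion::error::Result<()> {
+    // Register a Parquet file as a table and run SQL over it.
+    let ctx = SessionContext::new();
+    ctx.register_parquet("measurements", "measurements.parquet", ParquetReadOptions::default())
+        .await?;
+
+    let df = ctx
+        .sql("SELECT host, avg(usage) AS avg_usage FROM measurements GROUP BY host")
+        .await?;
+
+    // Results come back as Arrow Record Batches.
+    df.show().await?;
+    Ok(())
+}
+```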
+
+How IOx uses DataFusion
+- All queries and parquet creation + compaction run through a unified planning system based on DataFusion + Arrow
+
+## Query Processing in IOx
+![[Pasted image 20240525004430.png]]
+
+Resume at https://youtu.be/Y5K2Ik2oo-8?si=v_gJrv4yUmC1SxPo&t=1998
\ No newline at end of file
diff --git a/content/notes/Litestream - Making Single-Node Deployments Cool Again (2022).md b/content/notes/Litestream - Making Single-Node Deployments Cool Again (2022).md
index 3d96646..8058aca 100644
--- a/content/notes/Litestream - Making Single-Node Deployments Cool Again (2022).md
+++ b/content/notes/Litestream - Making Single-Node Deployments Cool Again (2022).md
@@ -6,6 +6,7 @@ created: 2024-02-29
source: https://www.youtube.com/watch?v=drgriZCRyrQ
origin: Ben Johnson
publish: true
+rating: 3
---
- Ben is the author of [Litestream](https://github.com/benbjohnson/litestream) & the key-value store [BoltDB](https://github.com/boltdb/bolt).
- What Litestream Does
diff --git a/content/notes/Materialize - A Streaming SQL Database Powered by Timely Dataflow (2020).md b/content/notes/Materialize - A Streaming SQL Database Powered by Timely Dataflow (2020).md
new file mode 100644
index 0000000..76d4e4e
--- /dev/null
+++ b/content/notes/Materialize - A Streaming SQL Database Powered by Timely Dataflow (2020).md
@@ -0,0 +1,92 @@
+---
+tags:
+ - db
+ - talks
+created: 2024-06-07
+source: https://www.youtube.com/watch?v=9XTg09W5USM
+origin: Arjun Narayan
+publish: true
+rating: 2
+---
+Arjun is the Co-founder and CEO of Materialize.
+## What is a streaming database?
+- Optimized for view-maintenance on an ongoing basis over streams of already processed txns.
+- Traditionally, most systems were either OLTP or OLAP systems.
+- Old Architecture:
+ - OLTP -> Batch ETL -> OLAP -> SQL -> BI Tools, Reports, Data Mining
+- Ideal Architecture Today:
+ - DB ->
+ - **(Streaming Pipeline)** Streaming CDC -> Stream Processing -> Microservices -> Visualization, Alerting, Monitoring
+ - Some tools for stream processing: Apache Flink, Apache Druid, Kafka Streams
+ - **(Batch Pipeline)** Batch ELT -> OLAP -> SQL -> BI Tools, Reports, Data Mining
+- A lot of tooling is needed to get data from a stream reliably to dashboards, alerts and applications. In essence a lot of these tools are just calculating materialized views over the streamed data. With Materialize:
+ - DB -> Streaming CDC -> Stream Processing -> Materialize -> SQL -> Visualization, Alerting, Monitoring
+
+## The Streaming Ecosystem
+- What's different about online view maintenance (OLVM)?
+ - **Queries run for a long time**
+ - OLTP/OLAP queries are optimized at execution time. You can re-plan for each query.
+ - Streaming queries need to be optimal forever.
+ - Once you create a view, the query plan is fixed.
+ - Query planning is much harder: an OLTP system maintains an evaluation context and can decide to bail and re-run a query, but that isn't an option in dataflow engines, which need a static query plan prepared up front.
+ - Error handling is also more difficult since the DB needs to keep running and making progress.
+ - **Statistics won't save you**
+ - Past performance is not indicative of future results.
+ - Most query optimization literature for OLAP is oriented around the idea of getting good cardinality estimates of your tables and using that to choose the best query plan.
+ - In streaming, you've to be adaptable to fast and slow moving streams which can change the no. of events by a large amount. Sometimes, you might be using data combined from a slow and fast moving stream which further increases the complexity.
+ - Q/A: You'd have a dimension and a fact table where the dimension table is mostly static. So you could have some statistics about the fact table using the dimension table. Or are there no static tables in streaming?
+ - In streaming, people reissue the dimension tables because of updates from batch ETL. This can be really bad for performance if your query plans assumed that the dimension tables are mostly going to be static.
+ - In OLAP, you could just re-plan everything.
+ - Q/A: Do people actually drop the entire dimension table and then load it back that often?
+ - Yeah. It's pretty common.
+ - **Writes are already ordered**
+ - No concurrency control (similar to OLAP) is needed.
+ - **Query patterns are known and (mostly) repeated**
+ - A lot of things can be done ahead of time.
+
+### A Streaming Manifesto
+- SQL : Declarative
+ - A lot of streaming systems don't have full SQL support (arbitrary joins, non-window join conditions etc.).
+- Do only what's necessary: Reactive to input changes, little to no busywork
+ - Existing streaming processors have massive hardware footprint even if they're ingesting low amounts of data due to complex queries.
+- Joins: No mandatory temporal windows; Arbitrary join conditions
+ - Existing streaming systems require streaming joins to be windowed along a temporal dimension (i.e. if the input stream is changing over time, the join is only evaluated over some fixed window)
+ - As per their docs, *"JOINs work over the available history of both streams, which ultimately provides an experience more similar to an RDBMS than other streaming platforms."*
+
+## Materialize : Architecture
+ - Materialize is built on top of timely dataflow and differential dataflow.
+ - https://github.com/TimelyDataflow/timely-dataflow
+ - Streaming compute engine. Scale-out, stateful, cyclic dataflow engine.
+ - Can apply arbitrary operators (written in Rust) on data. These operators are passed timestamps along w/ other input data.
+ - https://github.com/TimelyDataflow/differential-dataflow
+ - Has an opinionated set of differential operators (like Join, Aggregate, Filter, Map, Arrange)
+ - Arrange: index building operator that takes care of state management
+ - On top of these, Materialize handles client connections and maintains catalogs, streams and views. It does the parsing, planning and optimizing of queries and constructs dataflow plans from them.
+
+![[Pasted image 20240607164228.png]]
+
+Traditional streaming systems achieve parallelism by sharding on operator.
+- Eg. Multiple workers for "Count" operator in a pipeline.
+- This can get expensive for complex queries even for low data volume since the data passes through each worker for an operator one by one. Some of the operations (eg. "Filter") might even be a no-op for a lot of data.
+
+Timely dataflow cooperatively schedules every operator on each worker and shards the entire dataflow graph by key.
+- Timestamps drive the movement of data.
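+
+A toy sketch (mine, not from the talk) of a differential dataflow program: a per-key count that is maintained incrementally as the input collection changes. It assumes the `timely` and `differential-dataflow` crates; the data and names are made up.
+
+```rust
+use differential_dataflow::input::InputSession;
+use differential_dataflow::operators::Count;
+
+fn main() {
+    timely::execute_from_args(std::env::args(), move |worker| {
+        let mut input = InputSession::new();
+
+        // Build the dataflow once; afterwards it only reacts to input changes.
+        worker.dataflow(|scope| {
+            let page_views = input.to_collection(scope);
+            page_views
+                .count() // incrementally maintained count per page
+                .inspect(|x| println!("{:?}", x));
+        });
+
+        // Feed changes and advance the logical timestamp that drives the dataflow.
+        for t in 0..5u32 {
+            input.insert(format!("page-{}", t % 2));
+            input.advance_to(t + 1);
+            input.flush();
+            worker.step();
+        }
+    })
+    .expect("computation finished abnormally");
+}
+```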
+
+### Building Materialize : Experiences
+- Writing performant dataflow programs is very hard.
+- Efficient state management is the key.
+ - Existing stream processors outsource state management to RocksDB which leads to loss of control over things like when compaction happens.
+- SQL requires 100% coverage.
+ - Q/A: How do you cover the SQL standard entirely?
+ - We don't. We support Joins, Subqueries etc. Don't support CTEs and a few other things yet.
+
+## Q/A
+- Does this dataflow use multi-way joins with worst case optimality to do some kind of computations?
+ - We're using multi-way joins but not the algorithm with worst case join optimality.
+- What is the idea of consistency in your system?
+ - We don't do any concurrency control on our end because our inputs are ordered for us and have a timestamp associated with them but our system is essentially maintaining snapshot isolation.
+- Can I combine windows w/ different durations? (Sliding window semantics)
+ - No.
+- What aspect of the implementation in Materialize was the most difficult?
+ - Query planning to get out a static dataflow graph.
+ - Materialize's optimizer is written from scratch.
\ No newline at end of file
diff --git a/content/notes/Neon - Serverless PostgreSQL! (2022).md b/content/notes/Neon - Serverless PostgreSQL! (2022).md
new file mode 100644
index 0000000..1a3f145
--- /dev/null
+++ b/content/notes/Neon - Serverless PostgreSQL! (2022).md
@@ -0,0 +1,242 @@
+---
+tags:
+ - db
+ - talks
+created: 2024-03-25
+source: https://www.youtube.com/watch?v=rES0yzeERns
+origin: Heikki Linnakangas
+publish: true
+rating: 5
+---
+Heikki is the co-founder of Neon & a long-time PostgreSQL committer.
+
+## What is Neon?
+- New storage system for Postgres
+- Storage and compute are separated
+- Multi-tenant storage
+ - One storage layer that is shared across all customers and databases
+ - Shared cache
+- Single-tenant compute (Runs in K8s containers / VMs)
+ - Future plan: BYO Postgres with specific version and extensions installed
+- Cheap copy-on-write branching and time-travel query
+- Single-writer system i.e. single primary that's generating log & processing updates at any given time
+- Doesn't try to solve the multi-master problem or conflicts across regions
+- Postgres compatibility
+ - Currently, they've modified low-level storage in Postgres (read, write a page) so that those requests could be sent to their storage
+ - Other things like the Planner, Executor, index types, MVCC haven't been changed
+ - [Postgres Core Changes](https://github.com/neondatabase/neon/blob/3220f830b7fbb785d6db8a93775f46314f10a99b/docs/core_changes.md)
+
+## Separation of Storage and Compute
+![[Pasted image 20240325030634.png]]
+- Compute = PostgreSQL running in a VM
+- Storage = Neon storage system
+ - Written in Rust
+- pg streams WAL to the safekeepers
+ - pg has support for stream replication (used to stream WAL from primary to replica) which they modified to stream the WAL to their storage system.
+- pg reads pages from pageservers over network instead of local disk
+- write() in pg is a no-op. They just throw the page away & if it's needed again, it's requested from the storage system (pageserver), which reconstructs it using the WAL
+ - Don't need to do traditional checkpointing since writing is a no-op. There's no need to flush everything to disk in Neon.
+ - Still let pg run checkpoint since it performs other functions than just flushing to disk but they don't flush pages to disk.
+ - Local disk is only used for temporary files, sorting etc. Data is wiped when Postgres is restarted.
+- Why separate compute & storage?
+ - Compute can be shut down completely & started quickly
+ - Current startup time is 4s (includes launching the VM or K8s container, setting connection to storage & replying back to client)
+ - Same storage can be shared by multiple read-only nodes
+ - Scale independently
+ - Cloud storage is cheap
+ - Neon uses S3 or S3-compatible stores
+- Q/A : On the server-side when you boot up, do you pre-fetch anything in the buffer pool?
+ - Nothing currently. The storage system does have its own cache so there'll likely still be data in there if you recently re-started.
+ - There are pg extensions to pre-warm the cache that are compatible with Neon. (Probably [pg-prewarm](https://www.postgresql.org/docs/current/pgprewarm.html))
+ - But this brings in all the pages and not just the ones present at last shutdown.
+- Q/A : Does the server cache hold the WAL or materialized pages?
+ - Both.
+
+### Write Path
+- 3 safe-keeper nodes running at all times
+- Consensus algorithm based on Paxos
+ - Wait for majority to acknowledge txn. commit before sending acknowledgement to original client
+- Ensures durability of recent txn. (This is pretty much what safe-keepers are for)
+- Safe-keepers have local SSDs for storing WAL
+- Detour : Postgres WAL
+ - Classic ARIES-style representation
+ - Physical, stores updates to 8kB pages
+ - Mix of page images & incremental updates
+ - Doesn't store statements
+ - No UNDO log, only REDO
+- Page-servers
+ - Key of the storage system
+ - After log is durable in safe-keepers, it's streamed to page-servers
+ - Processes the WAL into a format (that lets them quickly find WAL records of a particular page) which is then written into immutable files on disk
+ - Uploads files to cloud storage
+ - Keep a copy in page-server for caching
+ - Local SSDs for caching
+- Durability
+ - Recent WAL is made durable in safe-keepers
+ - Older WAL is uploaded to cloud storage in processed format
+ - Page-servers are disposable
+
+### Read Path
+- Page-servers
+ - Replays WAL to reconstruct pages on demand
+ - Can reconstruct any page at any point in time
+ - Keeps all history up to some retention period
+ - Instead of traditional backups & WAL archive, they store data in their own format in cloud for random access
+ - Request for Page from pg -> Page-server finds the last image & replays the log of that single page -> Sends back the page
+- Q/A: Does the page-server wait for the safe-keeper to send it or requests it when it doesn't have a particular log record?
+ - When pg requests the page at a particular LSN, if the page-server doesn't have it yet, it'll wait
+- Q/A: Does pg maintain internally that for page 123, it expects this LSN or is it something you're adding?
+ - We've to add that.
+ - Wasn't needed for correctness. The primary node could request the latest LSN it wrote & it'd be correct but that'd cause a perf. problem because anytime you read anything from the page-server, you'd need to wait to get the latest version and most of the time there were no changes to that version.
+ - We had to add a cache that tracks LSN numbers of pages evicted from cache. (Last 1000 evicted pages w/ LRU)
+
+### Control Plane & Proxy
+![[Pasted image 20240325033014.png]]
+- When the client connects, it first connects to a proxy.
+- Proxy intercepts the connection, performs auth & queries the control plane about the running pg instance and starts one if none is running for the client.
+ - VMs are shut down after 5mins of inactivity.
+- Control plane starts & stops compute nodes
+ - Also provides the web UI & user-facing API for creating DBs, branches etc.
+- Q/A: Is the proxy written from scratch or did you use pgBouncer or something similar?
+ - From scratch. We don't use the proxy for connection pooling. We use it as a pass-through.
+
+## Storage Engine
+- Traditional Point-in-Time Recovery
+ - Take periodic backups and archive WAL to durable storage
+ - To restore
+ - Restore last backup before the point-in-time
+ - Replay all the log
+- Neon does this at page granularity
+ - Keeps backup of individual pages & stores WAL records of these individual pages
+ - WAL contains a mix of full page images & incremental WAL records
+ - pg prefers to write the full image of a page in case of bulk loading, building an index etc. otherwise it'll mostly store the incremental updates
+ - To reconstruct a page version
+ - Find the last image of the page
+ - Replay all the WAL records on top of it
+ - To make this perform well:
+ - Page-server reorders & indexes the WAL
+ - Materialize & store additional page images
+ - pg might have a lot of updates for a page so the page-server decides to store some additional page images so that the entire log doesn't have to be played back when the page is queried
+- Q/A
+ - Do you store only the latest version of a page in the page-server or could you materialize multiple ones?
+ - We can reconstruct any page version upto the retention period.
+ - Do you do any compression on the physical storage?
+ - Not currently. We plan to do it.
+ - If you don't set fill-factor right, that's normally just an interaction with auto-vacuum where updates could span multiple pages & auto-vacuum cleans that up. But since this system keeps page versions, does that mean you get a bunch of write amplification if the fill-factor knob isn't set right?
+ - fill-factor: how full pages will be packed (in table or index)
+ - Vacuum will create new versions of these pages but that's okay since the WAL records are quite small.
+
+- GetPage@LSN
+ - When pg needs to read a page, it sends a request to page-server : GetPage(RelFileNode, block #, LSN)
+ - Primary node uses "last-evicted LSN" of the page
+ - Last-evicted LSN is loosely tracked for each page
+ - Read-only node can be anchored at an old LSN
+ - For a time-travel query, e.g. if you want to recover to the point where you dropped a table, you launch a pg node, point it to the page-server & give it the LSN you want to read the data at. pg will send all requests at that specific LSN & you can see the data as it was at that point in time
+ - Read-only node that follows the primary
+ - There's a cache invalidation problem & if you've a read-only replica that's following the primary, the read-only node will still need to follow the WAL from the primary to figure out which pages are being modified because it might've a version of those pages in cache & it needs to throw them away.
+ - From the page-server side, it looks like the read-only node requests the pages with an increasing LSN as it tracks the primary
+### Storage Engine : Key-Value Store
+- Key: relation id + block number + LSN
+ - Relation ID tells which table or index a block belongs to
+- Value: 8kB page or WAL record
+- Metadata key+value pairs are stored for tracking things like relation size
+- Q/A : What's special about the multi-attribute key?
+ - We're doing range queries when you request a page at particular LSN. We need to find the last version of the page & it's not a point lookup.
+ - The LSN no. keeps incrementing as we digest new WAL. We don't replace the old value. We add to it & preserve the history too.
+- Inspired by LSM
+- Immutable files
+ - WAL is buffered in memory (in a BTree)
+ - When ~1GB of WAL has accumulated, it's written out to a new layer file (similar to an SSTable in LSMs)
+ - Existing layer files are never modified
+ - Old files can be merged & compacted, by creating new files and deleting old ones
+- All files are uploaded to cloud storage
+ - And downloaded back on-demand, if missing locally
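+
+A rough sketch (mine, not Neon's actual code or on-disk format) of how a (relation, block, LSN) key plus a range scan supports GetPage@LSN: walk the versions of the page at or below the requested LSN, newest first, collect WAL records until the most recent image is found, then replay them on top of it.
+
+```rust
+use std::collections::BTreeMap;
+use std::ops::Bound::Included;
+
+/// Toy key space: (relation id, block number, LSN). Layout is illustrative only.
+type Key = (u32, u32, u64);
+
+enum Value {
+    Image(Vec<u8>),     // full 8kB page image
+    WalRecord(Vec<u8>), // incremental change to apply on top of an older image
+}
+
+fn get_page_at_lsn(store: &BTreeMap<Key, Value>, rel: u32, block: u32, lsn: u64) -> Option<Vec<u8>> {
+    let mut deltas = Vec::new();
+    let range = (Included((rel, block, 0)), Included((rel, block, lsn)));
+    // Newest version first.
+    for (_key, value) in store.range(range).rev() {
+        match value {
+            Value::Image(img) => {
+                // Found the base image: replay the collected WAL records, oldest first.
+                let mut page = img.clone();
+                for record in deltas.iter().rev() {
+                    apply_wal_record(&mut page, record);
+                }
+                return Some(page);
+            }
+            Value::WalRecord(record) => deltas.push(record.clone()),
+        }
+    }
+    None
+}
+
+fn apply_wal_record(_page: &mut Vec<u8>, _record: &[u8]) {
+    // Stand-in for Postgres WAL redo; the real page-server delegates this to
+    // a WAL-redo process.
+}
+
+fn main() {
+    let mut store = BTreeMap::new();
+    store.insert((1, 7, 100), Value::Image(vec![0u8; 8192]));
+    store.insert((1, 7, 150), Value::WalRecord(vec![1, 2, 3]));
+    store.insert((1, 7, 200), Value::WalRecord(vec![4, 5, 6]));
+
+    // Reconstruct block 7 of relation 1 as of LSN 180: image@100 + record@150.
+    let page = get_page_at_lsn(&store, 1, 7, 180);
+    println!("reconstructed: {}", page.is_some());
+}
+```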
+
+#### Why not use an existing LSM implementation?
+- Need to access history
+ - Used RocksDB in an earlier prototype & used the block number + LSN as the key
+ - Didn't behave very well: as you keep accumulating new versions of a page you insert new key-value pairs, and compaction keeps moving those existing keys to the next level & so on. We didn't want that since we're never going to modify those keys & there's never any tombstone since we don't remove anything. Write amplification was quite bad with this.
+ - Many LSM tree implementations have support for snapshots and the capability to read older versions of a key-value pair. They typically do that for MVCC & Snapshot Isolation but don't really expose the functionality. Many of them wouldn't allow using our LSN number, or they'd only allow you to take a snapshot & then read all of the data, but wouldn't allow taking a snapshot in history & would only keep the snapshot while the system is running.
+- 2 kinds of values: images & deltas (= WAL records)
+- Need to control materialization
+ - Some implementations allowed hooking into the compact/merge operation & re-write some of the keys at that point but not all of the keys.
+- Upload/download from cloud storage
+- Branching for cheap copy-on-write branches
+ - This might've worked with other stores since it's implemented at a higher level in our storage engine.
+ - We create a new storage for each branch & if you fall to the bottom of that storage w/o finding a version of that page, you look at the parents.
+- Written in Rust or another memory-safe language
+ - Since our storage system is multi-tenant, the buffer cache is shared across different DBs belonging to different customers & we don't want to have a segfault or leak data from one DB to another.
+- We already have WAL & many key-value stores come with a WAL which we don't need
+
+## Storage Format
+![[Pasted image 20240325175452.png]]
+- Consists of immutable files called layer files
+- 2 kinds of layer files
+ - Image layer: contains a snapshot of all key-value pairs in a key-range at one LSN
+ - Created in background to speed up access and allow garbage collecting old data
+ - Image layer creation every ~20 seconds
+ - Delta layer: contains all changes in a key and LSN range
+ - If a key wasn't modified, it's not stored
+ - Incoming WAL is written out as delta layers
+- 2-D storage
+ - X/Y : block ID/LSN
+ - Rectangles are delta layers
+ - Horizontal **bars** are image layers
+ - Each file is roughly the same size (~1GB which seems pretty good for dealing w/ cloud storage)
+- Search
+ - To re-construct a page version, GetPage@LSN needs to find the last image of the page & all WAL records on top of it
+ - Search starts at the given block # and LSN, visit layers (downwards) until you find an image
+ - Delta layers may contain images
+ - Search stops at image layers
+ - Search is downwards. Look into the layer file & collect the WAL records for the particular key (if any) and so on until we hit the image layer which contains images of all the pages in the key-range.
+ - We've the last image & the WAL records now which can be replayed
+ - If a full image is found in the delta layer, we can stop the search earlier
+- Processing incoming WAL
+ - New delta layers are added to the top
+ - Logs are re-ordered and stored in their format for faster lookups
+- Compaction
+ - Re-shuffles data in delta layers that contain all of the changes for a larger LSN range but smaller key-range
+ - For better locality in searches
+ - Since you've fewer files & don't need to visit too many layers
+ - Might've got similar benefit with something like a bloom filter (but isn't implemented yet)
+ - To aid in garbage collection (of frequently updated parts)
+ - Mentioned that they aren't entirely sure about compaction for their use-case yet
+- Garbage Collection
+ - Removes old layer files that aren't needed anymore
+- Someone at Neon wrote a tool to visualise the cluster:
+![[Pasted image 20240325181518.png]]
+- Branching
+ - Neon supports cheap, copy-on-write branches & this is how they do backups.
+ - When you read a key on the child branch & it's not found, continue to read it from the parent at the branch point.
+- Open questions for Neon
+ - When to materialize pages pre-emptively?
+ - We don't need to materialize until the compute requests the page.
+ - If you've a workload that doesn't fit into the cache in pg then you'd keep frequently requesting pages it wrote a while ago & it'd affect your latency if we need to do a replay at that point. (It takes a few ms to collect the records & do the replay)
+ - They haven't solved the problem of when they should request a page pre-emptively.
+ - Q/A
+ - When you decommission the compute layer every 5 mins, do you signal the page-server to cleanup the data?
+ - We don't currently.
+ - How do compute servers know which page-servers to read from?
+ - The control plane keeps track of this. There is currently only 1 page-server for 1 database. It's not currently sharded but we plan to do it in future.
+ - Do you find yourself more limited by network bandwidth or disk bandwidth for reading from page-servers?
+ - One thing we ran into was the sequential scan speed. pg relies heavily on OS cache for sequential scans so when you're scanning it'll request page numbers one-by-one in order. The network round-trip for each individual page made this very slow for us so we added pre-fetching support.
+ - In other workloads, we've to face the overhead of reconstructing the pages doing the WAL replay if there were a lot of changes in the page.
+ - We had a dumb algorithm for keeping track of what layer files exist, which consumed a lot of CPU if you've a lot of layer files, but we're addressing that with a different data structure there.
+ - When to create image layers?
+ - When to merge delta layers?
+
+## Q/A
+- What % of DBs in Neon are swapped out?
+ - We try to keep everything in the page-server right now. We only rely on swapping out if we have to kill the page-server and reload it. We don't have enough data to need to swap things out. We've had the service running for a few months & that hasn't been enough time to create enough data to need to swap anything out.
+ - For the compute layers, we've b/w 50 and 100 active computes at any given time. We've about 3000 registered users.
+- You mentioned that it takes 4s if you've a decommissioned instance & you connect to it. How does this compare against serverless Postgres from Amazon?
+ - Amazon doesn't scale down to 0. Don't know how much time it takes for them to startup.
+ - For us, it takes ~1s to spin up the pod, few 100 ms to download a "base backup" (pg database directory w/o the data since the data is downloaded separately). We run some queries to check if pg works. We add a record to internal DB through control-plane for book-keeping to remember that we started this pod. And then there's the round-trip latency going to the client.
+ - pg database directory is needed so that pg can find anything that's not just a table or index.
+ - Goal is to get it down to 1s.
+- Since you modified the pg storage system & pg relies on the OS page cache, is there anything you had to change about the assumptions pg makes about having an OS page cache?
+ - The sequential scan was one thing since pg depended on OS to do the read-ahead. We don't get the benefit of the OS page cache.
+
+## Appendix
+- https://github.com/neondatabase/neon
+- https://neon.tech/
\ No newline at end of file
diff --git a/content/notes/Odyssey - PostgreSQL Connection Proxy! (2022).md b/content/notes/Odyssey - PostgreSQL Connection Proxy! (2022).md
index 7605775..3e82a08 100644
--- a/content/notes/Odyssey - PostgreSQL Connection Proxy! (2022).md
+++ b/content/notes/Odyssey - PostgreSQL Connection Proxy! (2022).md
@@ -3,10 +3,11 @@ tags:
- db
- talks
created: 2024-03-06
-updated: 2024-04-07
+updated: 2024-03-07
source: https://www.youtube.com/watch?v=VEYdZL0bU-I
origin: Andrey Borodin
publish: true
+rating: 5
---
- Andrey has worked on the database team at Yandex. Also a Postgres contributor since 2016.
- He has worked on a disaster recovery system for DBs: https://github.com/wal-g/wal-g & a stateless postgres query router: https://github.com/pg-sharding/spqr for sharded pg.
diff --git a/content/notes/PostgreSQL vs. fsync (2019).md b/content/notes/PostgreSQL vs. fsync (2019).md
new file mode 100644
index 0000000..c9e0a28
--- /dev/null
+++ b/content/notes/PostgreSQL vs. fsync (2019).md
@@ -0,0 +1,108 @@
+---
+tags:
+ - db
+ - talks
+created: 2024-07-24
+source: https://www.youtube.com/watch?v=1VWIGBQLtxo
+origin: Tomas Vondra
+publish: true
+rating: 3
+---
+Tomas is a long-time Postgres contributor (and now committer). Has been working with pg for 20 years.
+
+## How does pg handle durability?
+
+```
+[ Shared Buffers (managed by PG) ] [ WAL buffers ]
+
+[ Page Cache (kernel managed) ]
+
+[ Data Files ] | [ WAL (uses direct IO, no page cache) ]
+```
+
+- Any changes (Insert / Update / Delete) are written to the WAL using the very small WAL buffer through direct IO.
+- Changes are then done in the shared buffers.
+- On commit, only the transaction log is flushed.
+- Process for checkpointing (happens regularly)
+ - Get current WAL position
+ - Write data to page cache
+ - Call fsync on all files
+ - Delete unnecessary WAL
+
+What if there's an error?
+- on write
+ - possible but not common since the copy is in memory and we can repeat the write from the original copy
+- on fsync
+ - can happen quite easily due to interaction w/ drives, SAN (storage area network) etc.
+ - managed by kernel (pg doesn't have any copies)
+ - can't retry since page cache is managed by the kernel
+
+## Past Expectations
+
+### Expectation 1
+If there's an error during fsync, the next fsync call will try to flush the data from page cache again.
+
+Reality
+- The first fsync can fail with an error and data is discarded from page cache. The next fsync doesn't even try to flush this data again.
+- The exact behavior depends on the filesystem.
+ - ext4 leaves "dirty" data in page cache but the page gets marked as "clean" i.e. unless the data is modified again, it won't be written again making failures unpredictable
+ - xfs & btrfs throw away the data but the page is marked as not up to date (this behavior is better but not POSIX compliant)
+
+
+### Expectation 2
+There may be multiple file descriptors (fd) per file possibly from multiple processes. If the fsync fails in one process, the failure is reported in other processes too. (Eg. someone connects and calls fsync from the console on a running database)
+
+Reality
+- Only the first process (initiating the fsync) gets the error.
+- File may be closed/opened independently i.e. the other process may not see the file descriptor w/ the error.
+- Behavior also depends on kernel version
+ - up to 4.13, some errors may be quietly ignored
+ - 2016 - first process calling fsync gets the error, other processes get nothing
+ - 2017 - both processes get error, newly opened descriptors get nothing
+ - 2018 - both processes get error, newly opened descriptor only gets it when no one saw it yet
+- The reliable way to receive the error is to keep the oldest file descriptor around.
+- But pg hasn't done this so far. It has a small cache for file descriptors (so when one process closes the file descriptor and another process needs the file, it won't need to do a system call)
+- Behavior not limited to Linux
+ - BSD systems behave similarly (with the exception of FreeBSD/Illumos using ZFS)
+
+## Why did it take so long?
+
+**Why not an issue in the past?**
+- Storage was specifically designed for DBs.
+- You built locally connected drives with a RAID controller with write cache and a small battery. It was really reliable.
+- Failures were obvious since the system crashed.
+- I/O errors were permanent rather than transient.
+
+**Why a problem now?**
+- SAN, EBS, NFS
+- Thin provisioning (using virtualization to give appearance of having more resources than available)
+- Transient I/O errors are more common due to thin provisioning and ENOSPC (no space on drive error)
+- Was not noticed earlier.
+ - Mis-attributed to other causes like NFS
+ - Plenty of other sources of data corruption fixed
+
+> Other DBs (like Oracle) manage cache internally and don't use the kernel page cache unlike pg.
+
+**How to fix the issue?**
+- Modify the kernel
+ - Not relevant in the short / mid-term since changing fsync and page cache to retry would take a lot of effort.
+ - Pushback from the kernel developer community.
+ - Many systems have been running old kernel versions for a long time and would be reluctant to update.
+- Trigger PANIC in pg
+ - Make sure we get the error correctly.
+ - Trigger PANIC (crash) and recover from the WAL
+ - Has been implemented in pg now. See https://wiki.postgresql.org/wiki/Fsync_Errors
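+
+A minimal sketch (mine, not pg's actual C code) of the "treat fsync errors as fatal" idea in Rust: a failed write can be retried from the in-memory copy, but a failed fsync must not be retried since the kernel may have already dropped or cleaned the dirty pages and a second fsync can "succeed" without the data ever reaching disk.
+
+```rust
+use std::fs::OpenOptions;
+use std::io::Write;
+
+fn flush_page(path: &str, page: &[u8]) -> std::io::Result<()> {
+    let mut file = OpenOptions::new().write(true).open(path)?;
+    file.write_all(page)?;
+
+    // A failed write can be retried from the in-memory copy, but a failed
+    // fsync cannot: the page cache is owned by the kernel. The only safe
+    // reaction is to crash and recover from the WAL (what pg's PANIC does).
+    if let Err(err) = file.sync_all() {
+        panic!("fsync of {path} failed: {err}; crashing to recover from WAL");
+    }
+    Ok(())
+}
+
+fn main() -> std::io::Result<()> {
+    std::fs::write("/tmp/demo.page", [0u8; 8192])?;
+    flush_page("/tmp/demo.page", &[1u8; 8192])
+}
+```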
+
+Q/A
+- Would using a pluggable storage engine have prevented this error?
+ - No. Pluggable storage engines also use buffered IO w/ the kernel page cache.
+
+
+Appendix
+- https://www.postgresql.org/message-id/flat/CAMsr%2BYHh%2B5Oq4xziwwoEfhoTZgr07vdGG%2Bhu%3D1adXx59aTeaoQ%40mail.gmail.com
+- https://www.postgresql.org/message-id/flat/20180427222842.in2e4mibx45zdth5%40alap3.anarazel.de
+- https://lwn.net/Articles/752063/
+- https://lwn.net/Articles/724307/
+- https://www.pgcon.org/2018/schedule/events/1270.en.html
+ - https://www.youtube.com/watch?v=74c19hwY2oE&ab_channel=PGCon
+ - https://docs.google.com/presentation/d/1D6wTVgLK701CDzUJ3iwcnp5tVwbSlzCd5aPJAdLbq8Q/edit#slide=id.p
diff --git a/content/notes/Qdrant - Vector Search Engine Internals (2023).md b/content/notes/Qdrant - Vector Search Engine Internals (2023).md
index e674ef4..a30952a 100644
--- a/content/notes/Qdrant - Vector Search Engine Internals (2023).md
+++ b/content/notes/Qdrant - Vector Search Engine Internals (2023).md
@@ -8,10 +8,10 @@ tags:
source: https://www.youtube.com/watch?v=bU38Ovdh3NY
origin: Andrey Vasnetsov
publish: true
+rating: 4
---
- Andrey is co-founder, CTO at Qdrant. Working w/ search engines since 2014.
-- Qdrant : Vector Similarity Search Engine
- - Open-source, Written in Rust
+- Qdrant : Vector Similarity Search Engine, OSS, Written in Rust
## Vector Search : Overview
- You've an encoder (typically a neural network) which can convert some input data into dense vector representations (also called embeddings). They've an interesting property that pair of vectors in vector space which are close to each other usually corresponds to objects which are also similar in some sense.
diff --git a/content/notes/Rockset - Realtime Indexing for Fast Queries on Massive Semi-structured Data (2020).md b/content/notes/Rockset - Realtime Indexing for Fast Queries on Massive Semi-structured Data (2020).md
new file mode 100644
index 0000000..76007c4
--- /dev/null
+++ b/content/notes/Rockset - Realtime Indexing for Fast Queries on Massive Semi-structured Data (2020).md
@@ -0,0 +1,255 @@
+---
+tags:
+ - db
+ - talks
+created: 2024-06-11
+updated: 2024-06-23
+source: https://www.youtube.com/watch?v=YayQfWr3yzo
+origin: Dhruba Borthakur
+rating: 4
+publish: true
+---
+Dhruba is the co-founder and CTO at Rockset. Founding engineer of RocksDB and HDFS. Also worked on HBase, Hive, Andrew File System (AFS).
+
+**Where has data processing been?**
+- 2006-12: Hadoop
+ - Batch processing optimized for efficiency.
+ - Hadoop mainly got popular because it was able to handle a large amount of data.
+- 2012-18: Apache Spark, Kafka
+ - Stream processing optimized for throughput.
+- 2018-now: Rockset
+ - Analytical applications optimized for: data latency, query latency, QPS
+ - data latency: how long after the data is produced can you query it
+ - eg of analytical apps: realtime fleet management, game leaderboards
+
+What is Rockset?
+*Realtime indexing on massive datasets for building realtime apps on live data without ETL or pipelines.*
+
+## The Aggregator Leaf Tailer (ALT) Architecture
+
+![[Pasted image 20240611145858.png]]
+
+- Tailers "tail" data from each data stream and translate it into internal format.
+- Data is sent to leaf nodes where it's processed and kept ready for queries.
+- The 2-level aggregator serves SQL queries coming from applications or the dashboard.
+- Different from a traditional [Lambda Architecture](https://dataengineering.wiki/Concepts/Lambda+Architecture) & [Kappa Architecture](https://dataengineering.wiki/Concepts/Kappa+Architecture). Follows CQRS pattern (writes are separated from reads). Separating the writes allows them to handle high write throughput without impacting read query latency.
+- Got inspired while building the FB Newsfeed app which needs to look at a lot of data, rank it by relevance and show it.
+- Architecture is completely disaggregated so each component can be scaled independently.
+ - High write volume: increase tailers
+ - Amount of data storage increases: more leaf nodes
+ - More queries: more aggregators
+- Sidenote
+ - Scuba, Nemo at FB also do this. LinkedIn also uses the same architecture.
+ - Ref: [Aggregator Leaf Tailer: An Alternative to Lambda Architecture for Real-Time Analytics](https://rockset.com/blog/aggregator-leaf-tailer-an-architecture-for-live-analytics-on-event-streams/)
+
+### Key Benefits of ALT
+- Makes queries fast: indexing all data at time of ingest
+- Runs complex queries on the fly: Distributed aggregator tier that scales compute and memory resources independently
+- Cost-effective: All components can be independently scaled up or down.
+- Optimizes reads & writes in isolation w/ CQRS (write compute vs. read compute separation, not storage separation)
+
+## Converged Indexing
+- Ref: [Converged Index™: The Secret Sauce Behind Rockset's Fast Queries](https://rockset.com/blog/converged-indexing-the-secret-sauce-behind-rocksets-fast-queries/)
+- Rockset is a NoSQL database that accepts JSON, CSV, XML or any other semi-structured data.
+- *Rockset stores every column of every document in a row-based index, column-store index and an inverted index by default.*
+- Q/A: Are the indexes updated one at a time? Could I observe the row-based index being updated before the inverted index and so on?
+ - Rockset doesn't support ACID transactions but updates are atomic i.e. changes to all indexes will be observed at once and not separately.
+- How does converged indexing fit into ALT?
+ - The tailers extract all the fields inside the provided semi-structured data.
+ - The leaf houses Rockset's converged indexes.
+ - The optimizer can pick the index for the fastest query enhancing the performance of aggregators.
+### Internals
+- Built on top of key-value store RocksDB
+- Shreds document into many key-value pairs
+![[Pasted image 20240611155836.png]]
+- Types of keys
+ - R (Row) Keys
+ - Given the id and field, you can find its values & scan through them quickly and recreate the document you stored.
+ - C (Column Store) Keys
+ - Data for a particular column is stored together so you can do vectorization, scanning through all values etc.
+ - The columnar store is also built on top of the key-value store.
+ - S (Inverted Index) Keys
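+
+A toy sketch (mine, not Rockset's real encoding) of shredding one flat JSON document into R / C / S key-value pairs; the key layout and use of `serde_json` are purely illustrative.
+
+```rust
+use serde_json::{json, Value};
+
+fn shred(doc_id: u64, doc: &Value) -> Vec<(String, String)> {
+    let mut kv = Vec::new();
+    if let Value::Object(fields) = doc {
+        for (field, value) in fields {
+            let v = value.to_string();
+            // R key: doc id + field -> value (recreate the stored document quickly)
+            kv.push((format!("R.{doc_id}.{field}"), v.clone()));
+            // C key: field + doc id -> value (all values of a column stored together)
+            kv.push((format!("C.{field}.{doc_id}"), v.clone()));
+            // S key: field + value + doc id (find doc ids matching a predicate)
+            kv.push((format!("S.{field}.{v}.{doc_id}"), String::new()));
+        }
+    }
+    kv
+}
+
+fn main() {
+    let doc = json!({"name": "John", "age": 31, "city": "New York"});
+    for (key, value) in shred(1, &doc) {
+        println!("{key} => {value}");
+    }
+}
+```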
+
+#### Inverted Indexing for Point Lookups
+![[Pasted image 20240611161319.png]]
+- For each value, store documents containing that value.
+- Quickly retrieve a list of document IDs that match a predicate.
+- Note: Rockset supports nested fields like JSON, Arrays.
+
+#### Columnar Store for Aggregations
+![[Pasted image 20240611162107.png]]
+- Store each column separately.
+- Great compression.
+- Only fetch columns the query needs.
+
+#### Query Optimization Examples
+
+Highly Selective Query (will use the inverted index)
+```
+SELECT *
+FROM search_logs
+WHERE keyword = 'hpts'
+AND locale = 'en'
+```
+> Assumption: As per statistics, this query will have very few results
+
+--
+Large Scan (will use Columnar Store)
+```
+SELECT keyword, count(*)
+FROM search_logs
+GROUP BY keyword
+ORDER BY count(*) DESC
+```
+
+
+Q/A
+- Are you storing all the indexes in the same table-space of RocksDB?
+ - RocksDB has column families but we get better performance if we keep data in the same table-space (called column family in RocksDB)
+- Is RocksDB the right tool for this? It seems like you're sacrificing a lot of efficiency? RocksDB supports page-level compression but nothing else.
+ - RocksDB has delta encoding. Overhead for real-life datasets is very small.
+
+### Challenges with Converged Indexing
+- One new record = multiple servers update
+ - In a traditional database w/ term sharding & n indexes, 1 write incurs updates to n different indexes on n servers.
+ - Requires a distributed txn. (paxos, raft) b/w n servers.
+ - **Addressing the Challenge : Doc Sharding**
+ - Rockset doesn't use term sharding but doc sharding
+ - Term sharding: splitting the terms (keywords) in the inverted index across multiple shards, each shard handles a subset of terms
+ - Most traditional DBs do term sharding. They're optimized for throughput and efficiency but not latency.
+ - Doc sharding: all indices for a doc stored on 1 machine
+ - Elasticsearch, Google Search, FB Search use this
+ - Only paper I could find that mentions both the terms: https://www.vldb.org/pvldb/vol12/p709-archer.pdf
+ - Doc sharding means all new keys will only affect a single shard/leaf
+ - Updates are durably buffered to a distributed log
+ - Writes are piped through the distributed log and then split based on keys (eg. doc-id)
+ - Leafs only tail the documents in the shards they're responsible for
+ - Disadvantage: Query needs to fan out to all machines and get results even though only 1 of the machines has the data
+ - What prevents them from keeping a metadata store to point to shards?
+
+- One new doc = multiple random writes
+ - Traditional systems use BTree storage structure
+ - Keys are sorted across tables
+ - A single record update with multiple secondary indexes would incur writes to multiple different locations
+ - Addressing the Challenge : RocksDB LSM
+ - Multiple record updates accumulate in memory and are written into a single SST file.
+ - Keys are sorted b/w SST files via compaction in a background process.
+ - Multiple index updates from multiple docs result in one write to storage.
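+
+A small sketch (mine) of that idea with the Rust `rocksdb` crate: all index entries produced by one document go into a single atomic WriteBatch, i.e. one document becomes one write to storage. The keys mirror the toy R/C/S layout above and are made up.
+
+```rust
+use rocksdb::{Options, WriteBatch, DB};
+
+fn main() -> Result<(), rocksdb::Error> {
+    let mut opts = Options::default();
+    opts.create_if_missing(true);
+    let db = DB::open(&opts, "/tmp/converged-index-demo")?;
+
+    // Row, column and inverted-index entries for one document,
+    // applied as a single atomic batch (one storage write).
+    let mut batch = WriteBatch::default();
+    batch.put(b"R.1.name", b"\"John\"");
+    batch.put(b"C.name.1", b"\"John\"");
+    batch.put(b"S.name.\"John\".1", b"");
+    db.write(batch)?;
+
+    Ok(())
+}
+```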
+
+## Smart Schema SQL
+- What?
+ - Automatic generation of a schema based on the exact fields and types present at the time of ingest. Ingestion is schema-less.
+- Why?
+ - Avoid data pipelines that can cause data latency.
+ - Semi-structured data is complex and messy.
+ - Ingest any semi-structured data (similar to NoSQL)
+- Under the Hood
+ - Type information stored with values, not "columns".
+ - Strongly typed queries on dynamically typed fields.
+ - Designed for nested semi-structured data.
+
+Eg:
+
+```json
+// Documents
+{
+ "name": "John",
+ "age": 31,
+ "city": "New York"
+}
+
+{
+ "name": "Michael",
+ "age": "notfound",
+ "experiences": [
+ {
+ "title": "XYZ",
+ "years": 4
+ }
+ ]
+}
+
+// collection schema
+
+| field | occurrences | total | type |
+| ----------------------------- | ----------- | ----- | ------ |
+| ["name"] | 2 | 2 | string |
+| ["age"] | 2 | 2 | string |
+| ["age"] | 1 | 2 | int |
+| ["experiences"] | 1 | 1 | array |
+| ["experiences", "*"] | 1 | 1 | object |
+| ["experiences", "*", "title"] | 1 | 1 | string |
+| ["experiences", "*", "years"] | 1 | 1 | int |
+| ["city"] | 1 | 2 | string |
+```
+
+Q/A: Is this an actual table that's stored? No. This is materialized on query. There are type counters maintained on every leaf node. When we need to describe the table, it queries all leaf nodes and produces the table using the stored counters.
+
+**Schema Binding at Query Time**
+- Tailers ingest data without pre-defined schemas.
+- Aggregators use the schema to make queries faster.
+
+### Challenges with Smart Schemas
+Ref: https://rockset.com/blog/why-real-time-analytics-requires-both-the-flexibility-of-nosql-and-strict/
+- Additional CPU usage for processing queries
+ - Use type hoisting to reduce CPU required to run queries.
+ - ![[Pasted image 20240623185850.png]]
+ - The `S` type is hoisted to the beginning since the values have the same type. If a lot of values have the same type, then Rockset won't have much overhead.
+ - Schemaless means storing the type with every value.
+ - Rockset's query price / performance is on par with strict schema systems.
+- Requires increased disk space to store types
+ - Use field interning to reduce the space required to store schema.
+ - Instead of storing duplicate strings multiple times, the database stores a single copy of each unique string and uses references (or pointers) to that single instance wherever the string appears.
+ - ![[Pasted image 20240623185831.png]]
+ - Instead of `0: S "City"`, it should've been `0: S "Rancho Santa Margarita"`
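+
+A minimal sketch (mine) of field interning: each unique string is stored once and referenced by a small integer id wherever it appears. Rockset's actual representation isn't described in the talk.
+
+```rust
+use std::collections::HashMap;
+
+/// Minimal string interner: each unique field name is stored once and
+/// referenced by a small integer id (illustrative, not Rockset's format).
+#[derive(Default)]
+struct Interner {
+    ids: HashMap<String, u32>,
+    strings: Vec<String>,
+}
+
+impl Interner {
+    fn intern(&mut self, s: &str) -> u32 {
+        if let Some(&id) = self.ids.get(s) {
+            return id;
+        }
+        let id = self.strings.len() as u32;
+        self.strings.push(s.to_string());
+        self.ids.insert(s.to_string(), id);
+        id
+    }
+
+    fn resolve(&self, id: u32) -> &str {
+        &self.strings[id as usize]
+    }
+}
+
+fn main() {
+    let mut interner = Interner::default();
+    let a = interner.intern("city");
+    let b = interner.intern("city"); // same id, the string is stored once
+    assert_eq!(a, b);
+    println!("{} -> {}", a, interner.resolve(a));
+}
+```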
+
+## Cloud Scaling Architecture
+
+Key insight into economics of cloud
+- Cost of 1 CPU for 100 mins = Cost of 100 CPUs for 1 min
+ - Without cloud : statically provision for peak demand
+ - With cloud: dynamically provision for current demand
+- Goal : resources scale up and down as needed to achieve desired performance
+
+Cloud Autoscaling
+- What?
+ - Each tailer, leaf or aggregator can be independently scaled up and down as needed.
+- Why?
+ - No provisioning
+ - Pay for usage
+ - No need to provision for peak capacity
+
+Tailers are easy to scale up since they're stateless. They're scaled up using K8s and AWS autoscaling.
+
+### Scaling Leaf Nodes
+- Scale down : Use durability of cloud storage.
+ - RocksDB Cloud is a layer on top of RocksDB. Every time new SST files get produced, they're pushed to cloud storage (like AWS S3, GCS) by RocksDB Cloud.
+ - No data loss even if all leaf servers crash.
+- Scale up : Use zero-copy clones of rocksdb-cloud
+ - Takes SST files from an existing leaf shard and starts filling it in. It also starts tailing new data that tailers are generating.
+ - It then becomes a part of the query process and queries through the aggregators start coming to the new leaf process.
+ - No peer-to-peer replication needed so this has no performance impact on existing leaf servers.
+- Q/A : Since S3 is eventually consistent, where are you storing S3 keys in your system to do strong consistent reads?
+ - RocksDB has something called a "manifest" inside the DB. The replica reads the manifest and finds which S3 files are part of the database. If the replica doesn't find an S3 file yet, it retries and obtains it.
+
+### Separate write compute from query compute
+Ref: https://rockset.com/blog/remote-compactions-in-rocksdb-cloud/
+![[Pasted image 20240623193335.png]]
+
+## Summary : Analytics Application on Massive Datasets
+- Index rather than partition-and-scan
+- Separate write compute from query compute
+- Optimized for low data latency, low query latency and high QPS
+
+Q/A:
+- The SST files are written to Cloud but not the WAL. Does that mean that the tailers are stateful or do they rely on upstream durability for log replay?
+ - Other than tailers that tail a data source, Rockset also has a write API which writes directly.
+ - Rockset uses a distributed log to make things durable before it hits S3. If you already have data in sources like data-streams (Kafka) or data-lakes then you don't need this durability.
+ - If using the write API to Rockset, it uses a distributed log and does three-way replication for the last 1 or 5mins of logs before it actually hits the S3 storage system.
+
+## Appendix
+- https://github.com/rockset/rocksdb-cloud
+- [Scuba: Diving into Data at Facebook](https://research.facebook.com/publications/scuba-diving-into-data-at-facebook/)
+- [Nemo: Data discovery at Facebook](https://engineering.fb.com/2020/10/09/data-infrastructure/nemo/)
+- Slides from a similar presentation: https://files.devnetwork.cloud/DeveloperWeek/presentations/2020/Dhruba-Borthakur.pdf
+- Rockset was acquired by OpenAI.
\ No newline at end of file
diff --git a/content/notes/Snowflake Database Architecture Overview (2022).md b/content/notes/Snowflake Database Architecture Overview (2022).md
index 4d7e262..da5f457 100644
--- a/content/notes/Snowflake Database Architecture Overview (2022).md
+++ b/content/notes/Snowflake Database Architecture Overview (2022).md
@@ -6,6 +6,7 @@ created: 2024-02-11
source: https://www.youtube.com/watch?v=xnuv6vr8USE
origin: Bowei Chen, Kavinder Dhaliwal, Libo Wang
publish: true
+rating: 4
---
> **Remarks**
> Snowflake has a lot of moving parts so it was great to hear from multiple engineers on different things they've worked on.
diff --git a/content/notes/Umbra - A Disk-Based System with In-Memory Performance (2022).md b/content/notes/Umbra - A Disk-Based System with In-Memory Performance (2022).md
index bf1195a..844c190 100644
--- a/content/notes/Umbra - A Disk-Based System with In-Memory Performance (2022).md
+++ b/content/notes/Umbra - A Disk-Based System with In-Memory Performance (2022).md
@@ -6,6 +6,7 @@ created: 2024-02-26
source: https://www.youtube.com/watch?v=pS2_AJNIxzU
origin: Thomas Neumann
publish: true
+rating: 5
---
> **Remarks**
> Umbra has done a lot optimisations to get in-memory performance on a hybrid system. They adaptive query compiler that uses LLVM & generates machine code is the most interesting part.
diff --git a/content/til/Web Server with TLS in Go.md b/content/til/Web Server with TLS in Go.md
new file mode 100644
index 0000000..9952172
--- /dev/null
+++ b/content/til/Web Server with TLS in Go.md
@@ -0,0 +1,35 @@
+---
+tags:
+ - til
+created: 2024-03-10
+source: https://twitter.com/mholt6/status/1756013624374730773
+publish: true
+---
+
+```go
+package main
+
+import (
+	"log"
+
+	"github.com/gin-gonic/autotls"
+	"github.com/gin-gonic/gin"
+)
+
+func main() {
+	router := gin.Default()
+
+	router.GET("/", func(c *gin.Context) {
+		c.String(200, "Hello World!")
+	})
+
+	// autotls.Run serves the router over HTTPS with automatic Let's Encrypt certs.
+	log.Fatal(autotls.Run(router, "example.com"))
+}
+```
+
+```go
+// From Caddy's Author
+
+certmagic.HTTPS([]string{"example.com"}, mux)
+```
+
+Also see
+[Setting Up Vanity Subdomains for Your SaaS Using Next.js and Caddy](https://logsnag.com/blog/setting-up-vanity-subdomains-for-your-saas-using-caddy)
\ No newline at end of file
diff --git a/content/til/psql shows NULL as empty string.md b/content/til/psql shows NULL as empty string.md
new file mode 100644
index 0000000..1568fd9
--- /dev/null
+++ b/content/til/psql shows NULL as empty string.md
@@ -0,0 +1,34 @@
+---
+tags:
+ - db
+ - til
+created: 2024-03-23
+publish: true
+---
+
+This query returns null and not an empty string but that's not really evident in psql.
+```
+postgres=# SELECT to_char('1997-02-28 10:30:00'::TIMESTAMP, null);
+ to_char
+---------
+
+(1 row)
+```
+
+To change this behaviour, set this configuration
+```
+postgres=# \pset null null
+Null display is "null".
+```
+
+
+Context:
+I was working on https://github.com/apache/arrow-datafusion/pull/9689 and just assumed that the value being returned was an empty string since I didn't see anything when I ran the above SELECT query in psql. I should've verified my assumption like this:
+
+```sql
+SELECT
+ CASE
+ WHEN your_column IS NULL THEN 'Value is NULL'
+ ELSE 'Value is not NULL'
+ END;
+```
\ No newline at end of file