1. (50 min) What's new in ClickHouse 23.2.
2. (10 min) Q&A.
ClickHouse winter release.
— 18 new features ⛄
— 30 performance optimizations ⛷️
— 43 bug fixes ❄️
SELECT
county,
round(avg(price)) AS p,
ntile(4) OVER (ORDER BY p DESC
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS quartile
FROM uk_price_paid
WHERE date >= '2020-01-01'
GROUP BY county
ORDER BY p DESC
┌─county──────────────────────────────┬──────p─┬─quartile─┐
│ GREATER LONDON │ 818994 │ 1 │
│ WINDSOR AND MAIDENHEAD │ 708048 │ 1 │
│ SURREY │ 663735 │ 1 │
│ BUCKINGHAMSHIRE │ 567960 │ 1 │
│ HERTFORDSHIRE │ 547627 │ 1 │
│ OXFORDSHIRE │ 535430 │ 1 │
│ WOKINGHAM │ 525210 │ 1 │
│ SLOUGH │ 512606 │ 1 │
│ BRIGHTON AND HOVE │ 504168 │ 1 │
│ BRACKNELL FOREST │ 503690 │ 1 │
│ BATH AND NORTH EAST SOMERSET │ 478562 │ 1 │
│ WEST BERKSHIRE │ 476371 │ 1 │
│ WEST SUSSEX │ 458705 │ 1 │
│ HAMPSHIRE │ 435968 │ 1 │
│ KENT │ 431307 │ 1 │
│ ESSEX │ 422416 │ 1 │
│ CAMBRIDGESHIRE │ 422201 │ 1 │
│ CITY OF BRISTOL │ 416250 │ 1 │
│ BOURNEMOUTH, CHRISTCHURCH AND POOLE │ 410394 │ 1 │
│ READING │ 409315 │ 1 │
│ BEDFORD │ 405726 │ 1 │
│ MILTON KEYNES │ 404983 │ 1 │
│ RUTLAND │ 402453 │ 1 │
│ CENTRAL BEDFORDSHIRE │ 397968 │ 1 │
│ EAST SUSSEX │ 395728 │ 1 │
│ GLOUCESTERSHIRE │ 391370 │ 1 │
│ DORSET │ 387639 │ 1 │
│ SOUTH GLOUCESTERSHIRE │ 381043 │ 1 │
│ WILTSHIRE │ 378220 │ 1 │
│ THURROCK │ 376069 │ 2 │
│ ISLES OF SCILLY │ 375776 │ 2 │
│ WARWICKSHIRE │ 374423 │ 2 │
│ MONMOUTHSHIRE │ 374188 │ 2 │
│ WEST NORTHAMPTONSHIRE │ 366735 │ 2 │
│ DEVON │ 362632 │ 2 │
│ SOUTHEND-ON-SEA │ 356387 │ 2 │
│ SUFFOLK │ 352806 │ 2 │
│ SWINDON │ 352113 │ 2 │
│ YORK │ 347600 │ 2 │
│ NORTH SOMERSET │ 344954 │ 2 │
│ CORNWALL │ 344812 │ 2 │
│ CHESHIRE EAST │ 340419 │ 2 │
│ LUTON │ 337840 │ 2 │
│ HEREFORDSHIRE │ 335003 │ 2 │
│ SOMERSET │ 331621 │ 2 │
│ LEICESTERSHIRE │ 330577 │ 2 │
│ WORCESTERSHIRE │ 326095 │ 2 │
│ NORTH NORTHAMPTONSHIRE │ 318454 │ 2 │
│ MEDWAY │ 315527 │ 2 │
│ NORTH YORKSHIRE │ 313933 │ 2 │
│ NORTHAMPTONSHIRE │ 309357 │ 2 │
│ CARDIFF │ 306876 │ 2 │
│ THE VALE OF GLAMORGAN │ 306213 │ 2 │
│ SHROPSHIRE │ 304133 │ 2 │
│ NORFOLK │ 302952 │ 2 │
│ CHESHIRE WEST AND CHESTER │ 299911 │ 2 │
│ ISLE OF WIGHT │ 295424 │ 2 │
│ WARRINGTON │ 293347 │ 3 │
│ PORTSMOUTH │ 290390 │ 3 │
│ CITY OF PETERBOROUGH │ 286477 │ 3 │
│ SOUTHAMPTON │ 282539 │ 3 │
│ STAFFORDSHIRE │ 274396 │ 3 │
│ TORBAY │ 270348 │ 3 │
│ POWYS │ 269119 │ 3 │
│ WEST MIDLANDS │ 268282 │ 3 │
│ CITY OF NOTTINGHAM │ 264461 │ 3 │
│ GREATER MANCHESTER │ 263944 │ 3 │
│ LEICESTER │ 261435 │ 3 │
│ CEREDIGION │ 259058 │ 3 │
│ NOTTINGHAMSHIRE │ 258190 │ 3 │
│ ISLE OF ANGLESEY │ 253133 │ 3 │
│ LINCOLNSHIRE │ 250888 │ 3 │
│ DERBYSHIRE │ 248036 │ 3 │
│ WREXHAM │ 245507 │ 3 │
│ NORTHUMBERLAND │ 243114 │ 3 │
│ PEMBROKESHIRE │ 241746 │ 3 │
│ NEWPORT │ 241240 │ 3 │
│ WEST YORKSHIRE │ 238132 │ 3 │
│ EAST RIDING OF YORKSHIRE │ 237599 │ 3 │
│ WREKIN │ 236250 │ 3 │
│ CONWY │ 230032 │ 3 │
│ CITY OF DERBY │ 230025 │ 3 │
│ CUMBRIA │ 229085 │ 3 │
│ SWANSEA │ 226946 │ 3 │
│ CITY OF PLYMOUTH │ 226746 │ 3 │
│ GWYNEDD │ 225271 │ 4 │
│ FLINTSHIRE │ 220821 │ 4 │
│ DENBIGHSHIRE │ 219846 │ 4 │
│ MERSEYSIDE │ 216572 │ 4 │
│ SOUTH YORKSHIRE │ 215204 │ 4 │
│ LANCASHIRE │ 213512 │ 4 │
│ TYNE AND WEAR │ 212214 │ 4 │
│ TORFAEN │ 207631 │ 4 │
│ CARMARTHENSHIRE │ 204374 │ 4 │
│ BRIDGEND │ 202547 │ 4 │
│ STOCKTON-ON-TEES │ 200061 │ 4 │
│ BLACKBURN WITH DARWEN │ 197650 │ 4 │
│ DARLINGTON │ 195512 │ 4 │
│ HALTON │ 193899 │ 4 │
│ NORTH LINCOLNSHIRE │ 192586 │ 4 │
│ CAERPHILLY │ 181940 │ 4 │
│ STOKE-ON-TRENT │ 168334 │ 4 │
│ COUNTY DURHAM │ 167186 │ 4 │
│ NORTH EAST LINCOLNSHIRE │ 159214 │ 4 │
│ REDCAR AND CLEVELAND │ 157677 │ 4 │
│ MIDDLESBROUGH │ 153811 │ 4 │
│ NEATH PORT TALBOT │ 152495 │ 4 │
│ MERTHYR TYDFIL │ 152328 │ 4 │
│ CITY OF KINGSTON UPON HULL │ 151618 │ 4 │
│ RHONDDA CYNON TAFF │ 150496 │ 4 │
│ HARTLEPOOL │ 143567 │ 4 │
│ BLACKPOOL │ 142350 │ 4 │
│ BLAENAU GWENT │ 128275 │ 4 │
└─────────────────────────────────────┴────────┴──────────┘
Developer: lgbo-ustc.
:) SELECT arrayMap(x -> round(x, 3),
arrayJoin(corrMatrix(
randNormal(0, 1, 'first') AS x,
randNormal(0, 1, 'second') AS y,
x + y,
x - y))) AS matrix
FROM numbers(100000)
┌─matrix──────────────────────────┐
│ [ 1, -0.003, 0.707, 0.709] │
│ [-0.003, 1, 0.705, -0.707] │
│ [ 0.707, 0.705, 1, 0.003] │
│ [ 0.709, -0.707, 0.003, 1] │
└─────────────────────────────────┘
Developer: FFFFFFFHHHHHHH from Tencent.
Bonus: covarPopMatrix, covarSampMatrix.
SELECT arrayJoin(corrMatrix(p2000, p2010, p2020))
FROM (
WITH toYear(date) AS y
SELECT
county,
avgIfOrNull(price, y >= 2000 AND y <= 2009) AS p2000,
avgIfOrNull(price, y >= 2010 AND y <= 2019) AS p2010,
avgIfOrNull(price, y >= 2020) AS p2020
FROM uk_price_paid
WHERE y >= 2000
GROUP BY county)
┌─arrayJoin(corrMatrix(p2000, p2010, p2020))──────────────────┐
│ [1, 0.7843206077587307, 0.913951646326657] │
│ [0.7843206077587307, 1, 0.787124002985455] │
│ [0.913951646326657, 0.7871240029854553, 1] │
└─────────────────────────────────────────────────────────────┘
Developer: FFFFFFFHHHHHHH from Tencent. Bonus: covarPopMatrix, covarSampMatrix.
SELECT generateULID() FROM numbers(10)
┌─generateULID()─────────────┐
│ 01GSYTSHSTNN3AWR9W02WQVTFP │
│ 01GSYTSHSTNN3AWR9W02WQVTFQ │
│ 01GSYTSHSTNN3AWR9W02WQVTFR │
│ 01GSYTSHSTNN3AWR9W02WQVTFS │
│ 01GSYTSHSTNN3AWR9W02WQVTFT │
│ 01GSYTSHSTNN3AWR9W02WQVTFV │
│ 01GSYTSHSTNN3AWR9W02WQVTFW │
│ 01GSYTSHSTNN3AWR9W02WQVTFX │
│ 01GSYTSHSTNN3AWR9W02WQVTFY │
│ 01GSYTSHSTNN3AWR9W02WQVTFZ │
└────────────────────────────┘
^^^^^^^ - monotonic timestamp
^^^^^^^^^^^^^^^^^^^ - random
^ - sequential if generated in a short time
128 bit, base32
Developer: Nikolai Degterinskiy.
:) SELECT arrayShuffle(
tokens('ClickHouse is the Open Source OLAP database management system'))
┌─x─────────────────────────────────────────────────────────────────────────────────┐
│ ['database','is','ClickHouse','management','Open','the','OLAP','system','Source'] │
└───────────────────────────────────────────────────────────────────────────────────┘
:) SELECT arrayResize(arrayPartialSort(x -> lower(x), 3,
tokens('ClickHouse is the Open Source OLAP database management system'))), 3)
┌─x──────────────────────────────┐
│ ['ClickHouse','database','is'] │
└────────────────────────────────┘
Developer: Joanna Hulboj. Bonus: arrayPartialShuffle, arrayPartialReverseSort.
:) SET final = 1;
Equivalent specifying FINAL for every table in the SELECT query:
:) SELECT ... FROM table FINAL
Allows "merge-on-read" logic for data updates.
Applicable mainly for ReplacingMergeTree.
Developer: Arthur Passos.
SELECT PE.Names, PE.Values
FROM system.part_log
ARRAY JOIN ProfileEvents AS PE
WHERE table = 'uk_price_paid' AND event_type = 'MergeParts'
┌─PE.Names────────────────────────────────┬──PE.Values─┐
│ FileOpen │ 194 │
│ ReadBufferFromFileDescriptorRead │ 320 │
│ ReadBufferFromFileDescriptorReadBytes │ 216292400 │
│ WriteBufferFromFileDescriptorWrite │ 608 │
│ WriteBufferFromFileDescriptorWriteBytes │ 179080584 │
│ ReadCompressedBytes │ 216292400 │
│ CompressedReadBufferBlocks │ 11893 │
│ CompressedReadBufferBytes │ 834374886 │
│ OpenedFileCacheMisses │ 147 │
│ IOBufferAllocs │ 470 │
│ IOBufferAllocBytes │ 119823558 │
│ CreatedReadBufferOrdinary │ 147 │
│ DiskReadElapsedMicroseconds │ 137308 │
│ DiskWriteElapsedMicroseconds │ 127250 │
│ MergedRows │ 27910954 │
│ MergedUncompressedBytes │ 2692905763 │
│ MergesTimeMilliseconds │ 14704 │
│ ContextLock │ 314 │
│ QueryProfilerRuns │ 33 │
└─────────────────────────────────────────┴────────────┘
Developer: Bharat Nallan.
If a table is partitioned by aggregation key,
avoid merging the state between partitions.
Demo.
Developer: Nickita Taranov.
Allows to iteratively apply filters while reading.
Demo.
Developer: Alexander Gololobov.
A magic pill to make IO less slow.
Userspace Ring buffer for Input and Output
— Linux kernel developers like using short cryptic names.
Demo.
Developer: Saulius Valatka.
Now we support all three indistinguishable technologies:
Apache Hudi, Apache Delta Lake, Apache Iceberg
SELECT * FROM iceberg(
'https://bucket.s3.amazonaws.com/test_table',
s3 credentials...)
CREATE TABLE t ENGINE = Iceberg(
'https://bucket.s3.amazonaws.com/test_table',
s3 credentials...)
SELECT * FROM iceberg(named_conf
'https://bucket.s3.amazonaws.com/test_table')
Developer: ucasFL.
Now ClickHouse supports Apache Hudi, Delta Lake, Apache Iceberg
for SELECT queries.
— ClickHouse integrates with everything!
— s3, hdfs, https, mysql, postgres, sqlite, mongodb, jdbc, odbc...
— query the data without loading!
Advantages:
— these formats are somewhat resembling MergeTree
allowing incremental data insertion,
approaching to ClickHouse data formats;
Disadvantages:
— a band-aid solution to organize the data;
How to prepare data:
wget https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
mkdir -p ~/hadoop/spark-3.3.2/
tar -xzf spark-3.3.2-bin-hadoop3.tgz -C ~/hadoop/spark-3.3.2/ --strip 1
echo "spark.driver.host localhost" > $SPARK_HOME/conf/spark-defaults.conf
sudo apt install default-jre
export SPARK_HOME=~/hadoop/spark-3.3.2
export JAVA_HOME=/usr/lib/jvm/default-java
cd $SPARK_HOME
bin/spark-shell --packages io.delta:delta-core_2.12:2.1.1,org.apache.hadoop:hadoop-aws:3.3.2 \
--packages org.apache.hudi:hudi-spark3.3-bundle_2.12:0.13.0 \
--packages org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:0.14.0 \
--driver-memory 100g
spark.read.load("/home/ubuntu/hits.parquet").write.format("delta").mode("overwrite").save("file:///home/ubuntu/delta")
spark.read.load("/home/ubuntu/hits.parquet").write.format("hudi").mode("overwrite").save("file:///home/ubuntu/hudi")
spark.read.load("/home/ubuntu/hits.parquet").write.format("iceberg").mode("overwrite").save("file:///home/ubuntu/iceberg")
It did not work. Random Java errors. I don't know Java :( ChatGPT did not help me.
Was:
storage_configuration:
disks:
web:
type: web
endpoint: 'https://clickhouse-public-datasets.s3.amazonaws.com/web/'
policies:
web:
volumes:
main:
disk: web
ATTACH TABLE hits ...
ENGINE = MergeTree ORDER BY CounterID
SETTINGS storage_policy = 'web'
Slightly better:
storage_configuration:
disks:
web:
type: web
endpoint: 'https://clickhouse-public-datasets.s3.amazonaws.com/web/'
ATTACH TABLE hits ...
ENGINE = MergeTree ORDER BY CounterID
SETTINGS disk = 'web'
No need for "storage policy" in simple cases.
Much better:
ATTACH TABLE hits ...
ENGINE = MergeTree ORDER BY CounterID
SETTINGS disk = disk(
type = 'web',
endpoint = 'https://clickhouse-public-datasets.s3.amazonaws.com/web/')
100% dynamic configuration, no need to touch the configuration files.
Developers: Ksenia Sumarokova.
ATTACH TABLE github_events UUID '127f4241-4a9b-4ecd-8a84-846b88069cb5'
(
`file_time` DateTime,
`event_type` Enum8('CommitCommentEvent' = 1, 'CreateEvent' = 2, 'DeleteEvent' = 3, 'ForkEvent' = 4, 'GollumEvent' = 5, 'IssueCommentEvent' = 6, 'IssuesEvent' = 7, 'MemberEvent' = 8, 'PublicEvent' = 9, 'PullRequestEvent' = 10, 'PullRequestReviewCommentEvent' = 11, 'PushEvent' = 12, 'ReleaseEvent' = 13, 'SponsorshipEvent' = 14, 'WatchEvent' = 15, 'GistEvent' = 16, 'FollowEvent' = 17, 'DownloadEvent' = 18, 'PullRequestReviewEvent' = 19, 'ForkApplyEvent' = 20, 'Event' = 21, 'TeamAddEvent' = 22),
`actor_login` LowCardinality(String),
`repo_name` LowCardinality(String),
`created_at` DateTime,
`updated_at` DateTime,
`action` Enum8('none' = 0, 'created' = 1, 'added' = 2, 'edited' = 3, 'deleted' = 4, 'opened' = 5, 'closed' = 6, 'reopened' = 7, 'assigned' = 8, 'unassigned' = 9, 'labeled' = 10, 'unlabeled' = 11, 'review_requested' = 12, 'review_request_removed' = 13, 'synchronize' = 14, 'started' = 15, 'published' = 16, 'update' = 17, 'create' = 18, 'fork' = 19, 'merged' = 20),
`comment_id` UInt64,
`body` String,
`path` String,
`position` Int32,
`line` Int32,
`ref` LowCardinality(String),
`ref_type` Enum8('none' = 0, 'branch' = 1, 'tag' = 2, 'repository' = 3, 'unknown' = 4),
`creator_user_login` LowCardinality(String),
`number` UInt32,
`title` String,
`labels` Array(LowCardinality(String)),
`state` Enum8('none' = 0, 'open' = 1, 'closed' = 2),
`locked` UInt8,
`assignee` LowCardinality(String),
`assignees` Array(LowCardinality(String)),
`comments` UInt32,
`author_association` Enum8('NONE' = 0, 'CONTRIBUTOR' = 1, 'OWNER' = 2, 'COLLABORATOR' = 3, 'MEMBER' = 4, 'MANNEQUIN' = 5),
`closed_at` DateTime,
`merged_at` DateTime,
`merge_commit_sha` String,
`requested_reviewers` Array(LowCardinality(String)),
`requested_teams` Array(LowCardinality(String)),
`head_ref` LowCardinality(String),
`head_sha` String,
`base_ref` LowCardinality(String),
`base_sha` String,
`merged` UInt8,
`mergeable` UInt8,
`rebaseable` UInt8,
`mergeable_state` Enum8('unknown' = 0, 'dirty' = 1, 'clean' = 2, 'unstable' = 3, 'draft' = 4),
`merged_by` LowCardinality(String),
`review_comments` UInt32,
`maintainer_can_modify` UInt8,
`commits` UInt32,
`additions` UInt32,
`deletions` UInt32,
`changed_files` UInt32,
`diff_hunk` String,
`original_position` UInt32,
`commit_id` String,
`original_commit_id` String,
`push_size` UInt32,
`push_distinct_size` UInt32,
`member_login` LowCardinality(String),
`release_tag_name` String,
`release_name` String,
`review_state` Enum8('none' = 0, 'approved' = 1, 'changes_requested' = 2, 'commented' = 3, 'dismissed' = 4, 'pending' = 5)
)
ENGINE = MergeTree
ORDER BY (event_type, repo_name, created_at)
SETTINGS disk = disk(type = 'web', endpoint = 'https://clickhouse-public-datasets.s3.amazonaws.com/web/');
SELECT body FROM github_events WHERE actor_login = 'alexey-milovidov' AND event_type = 'IssueCommentEvent' LIMIT 10;
Metabase:
— the first stable release 1.0.0;
— all databases are visible;
— available in Metabase Cloud;
Kafka Connect:
— support for Amazon MSK;
Updates for Python, Node.js, Go drivers and dbt.
— free 30-day trial with $300 credits up to 10 TB of data;
— affordable clusters for developers with full HA < $100/month
Try it! https://clickhouse.cloud/.