by @jasonaowen
https://jasonaowen.github.io/pdxpug-incremental-schema
https://github.com/jasonaowen/pdxpug-incremental-schema
https://www.githubarchive.org/
wget + cron
Gzipped, Line-Delimited JSON
February 18th, 5pm Pacific time: http://data.githubarchive.org/2016-02-19-01.json.gz
Python is pretty cool: a short script, json-to-postgres.py, reads line-delimited JSON on stdin and inserts each line as a row.
CREATE TABLE raw_events (
  id SERIAL PRIMARY KEY,
  event JSONB NOT NULL
);
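Each line of the archive becomes one row. Conceptually, the import script runs one insert per line (the event below is a made-up stand-in; the real script parameterizes the value):
INSERT INTO raw_events (event)
VALUES ('{"id": "1234567890", "type": "PushEvent", "public": true}'::JSONB);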
Unicode allows escaped 0 bytes, but if you try to look at a JSON object with one:
ERROR:  unsupported Unicode escape sequence
DETAIL:  \u0000 cannot be converted to text.
CONTEXT:  JSON data, line 1: ...yload": {"action": "created", "comment": {"body":...
The jsonb type also rejects \u0000 (because that cannot be represented in PostgreSQL's text type)
http://www.postgresql.org/docs/9.5/static/datatype-json.html
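You can reproduce this without the archive data; a minimal demonstration:
SELECT '{"body": "\u0000"}'::JSON;          -- fine: json stores the raw text
SELECT '{"body": "\u0000"}'::JSONB;         -- ERROR: unsupported Unicode escape sequence
SELECT '{"body": "\u0000"}'::JSON->>'body'; -- ERROR: the conversion to text fails here, too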
zcat 2016-02-19-01.json.gz | grep '\\u0000' | wc -l
The double backslash matters.
Other approaches are valid here; you could halt the import, or substitute a placeholder using sed. For now, just filter those events out:
zcat 2016-02-19-01.json.gz | grep -v '\\u0000' | python json-to-postgres.py owenja pdxpug raw_events event
SELECT nspname || '.' || relname AS "relation",
       pg_size_pretty(pg_relation_size(C.oid)) AS "size"
FROM pg_class C
LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace)
WHERE nspname NOT IN ('pg_catalog', 'information_schema')
ORDER BY pg_relation_size(C.oid) DESC;
Will someone write this down?
Work on a copy, so the raw import stays pristine:
CREATE TABLE events (
  id INTEGER PRIMARY KEY REFERENCES raw_events(id),
  event JSONB NOT NULL
);
INSERT INTO events (id, event) SELECT id, event FROM raw_events;
Scalars: strings, numbers, booleans, and null.
Containers: arrays and objects.
{ "created_at": "2011-09-06T17:26:27Z", "id": 12345 }
An object is a collection of keys and values. Keys are always strings; values can be any JSON type. Keys may not be unique!
SELECT jsonb_typeof(event), COUNT(*) FROM events GROUP BY jsonb_typeof(event);
CREATE FUNCTION type_count(in table_name text, in column_name text)
RETURNS TABLE (json_type text, count bigint) AS $func$
BEGIN
  RETURN QUERY EXECUTE format(
    'SELECT jsonb_typeof(%I), COUNT(*) FROM %I GROUP BY jsonb_typeof(%I)',
    column_name, table_name, column_name);
END
$func$ LANGUAGE plpgsql STABLE;
SELECT * FROM type_count('events', 'event');
SELECT jsonb_object_keys(event), COUNT(*) FROM events GROUP BY jsonb_object_keys(event);
CREATE FUNCTION key_count(in table_name text, in column_name text)
RETURNS TABLE (key text, count bigint) AS $func$
BEGIN
  RETURN QUERY EXECUTE format(
    'SELECT jsonb_object_keys(%I), COUNT(*) FROM %I GROUP BY jsonb_object_keys(%I)',
    column_name, table_name, column_name);
END
$func$ LANGUAGE plpgsql STABLE;
SELECT * FROM key_count('events', 'event');
Now that we know what's in those objects, let's get them into columns.
ALTER TABLE events
  ADD COLUMN actor JSONB,
  ADD COLUMN created_at JSONB,
  ADD COLUMN github_id JSONB,
  ADD COLUMN org JSONB,
  ADD COLUMN payload JSONB,
  ADD COLUMN public JSONB,
  ADD COLUMN repo JSONB,
  ADD COLUMN type JSONB;
UPDATE events SET
  actor = event->'actor',
  created_at = event->'created_at',
  github_id = event->'id',
  org = event->'org',
  payload = event->'payload',
  public = event->'public',
  repo = event->'repo',
  type = event->'type';
(The event's id key lands in github_id, since events already has an id column of its own.)
CREATE FUNCTION explode_json_column(
  in table_name text,
  in column_name text
) RETURNS void AS $func$
DECLARE
  record RECORD;
BEGIN
  FOR record IN SELECT key, count FROM key_count(table_name, column_name) LOOP
    EXECUTE format('ALTER TABLE %I ADD COLUMN %I JSONB',
                   table_name, record.key);
    EXECUTE format('UPDATE %I SET %I = %I->%L',
                   table_name, record.key, column_name, record.key);
  END LOOP;
END
$func$ LANGUAGE plpgsql;
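To make the dynamic SQL concrete: a call we'll run later, explode_json_column('pushes', 'payload'), executes a pair of statements like this for each key:
ALTER TABLE pushes ADD COLUMN ref JSONB;
UPDATE pushes SET ref = payload->'ref';
(It wouldn't work on events directly, since events already has an id column; that's why we renamed id to github_id by hand above.)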
PostgreSQL doesn't know how to convert directly between JSON[B] and other scalar types; you have to go through TEXT.
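For example, the direct cast fails, and the double cast through TEXT is the workaround used below:
SELECT created_at::TIMESTAMP FROM events LIMIT 1;
-- ERROR:  cannot cast type jsonb to timestamp without time zone
SELECT created_at::TEXT::TIMESTAMP FROM events LIMIT 1;  -- works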
SELECT * FROM type_count('events', 'created_at');
SELECT created_at FROM events LIMIT 5;
ALTER TABLE events ALTER COLUMN created_at TYPE TIMESTAMP USING created_at::TEXT::TIMESTAMP;
SELECT * FROM type_count('events', 'public');
SELECT public FROM events LIMIT 5;
ALTER TABLE events ALTER COLUMN public TYPE BOOLEAN USING public::TEXT::BOOLEAN;
The public field in this dataset is always true; you could probably drop it.
SELECT * FROM type_count('events', 'type');
SELECT type FROM events LIMIT 5;
ALTER TABLE events ALTER COLUMN type TYPE TEXT USING type::TEXT::TEXT;
SELECT type FROM events LIMIT 5; UPDATE events SET type = TRIM(BOTH '"' FROM type);
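An aside not in the deck: when pulling values straight out of the JSON, the ->> operator extracts unquoted text directly, avoiding the TRIM dance:
SELECT event->'type' AS with_quotes,     -- jsonb: "PushEvent"
       event->>'type' AS without_quotes  -- text: PushEvent
FROM events LIMIT 1;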
CREATE FUNCTION set_concrete_type(
  in table_name text,
  in column_name text,
  in type_name text
) RETURNS void AS $func$
BEGIN
  EXECUTE format(
    'ALTER TABLE %I ALTER COLUMN %I TYPE %s USING TRIM(BOTH ''"'' FROM %I::TEXT)::%s',
    table_name, column_name, type_name, column_name, type_name);
END
$func$ LANGUAGE plpgsql;
SELECT * FROM type_count('events', 'github_id');
SELECT github_id FROM events LIMIT 5;
SELECT set_concrete_type('events', 'github_id', 'BIGINT');
Let's normalize a little.
SELECT * FROM type_count('events', 'actor');
SELECT COUNT(DISTINCT actor) FROM events;
SELECT * FROM key_count('events', 'actor');
CREATE TABLE actors (
  actor_id SERIAL PRIMARY KEY,
  payload JSONB UNIQUE NOT NULL
);
INSERT INTO actors (payload) SELECT DISTINCT actor FROM events;
ALTER TABLE events ADD COLUMN actor_id INTEGER REFERENCES actors (actor_id);
UPDATE events SET actor_id = (SELECT actor_id FROM actors WHERE actor = payload);
Without that UNIQUE constraint (and the index that enforces it), this update would have taken far longer.
ALTER TABLE events ALTER COLUMN actor_id SET NOT NULL, DROP COLUMN actor;
SELECT explode_json_column('actors', 'payload');
SELECT * FROM type_count('actors', 'avatar_url');
SELECT set_concrete_type('actors', 'avatar_url', 'TEXT');
SELECT * FROM type_count('actors', 'id');
SELECT set_concrete_type('actors', 'id', 'BIGINT');
SELECT * FROM type_count('actors', 'login');
SELECT set_concrete_type('actors', 'login', 'TEXT');
SELECT * FROM type_count('actors', 'url');
SELECT set_concrete_type('actors', 'url', 'TEXT');
SELECT * FROM type_count('actors', 'gravatar_id');
SELECT DISTINCT gravatar_id FROM actors;
ALTER TABLE actors DROP COLUMN gravatar_id;
If it's always empty, do we still need it?
ALTER TABLE actors DROP COLUMN payload;
The payload column is a little different: which keys it has depends on the event type.
SELECT * FROM key_count('events', 'payload');
SELECT type, jsonb_object_keys(payload), COUNT(*) FROM events GROUP BY type, jsonb_object_keys(payload);
CREATE TABLE pushes (
  id SERIAL PRIMARY KEY,
  event_id INTEGER NOT NULL REFERENCES events(id),
  payload JSONB NOT NULL
);
INSERT INTO pushes (event_id, payload) SELECT id, payload FROM events WHERE type = 'PushEvent';
Note that the foreign key points the other way: pushes references events, because only some events are pushes.
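So finding the push for an event means joining through pushes.event_id, for example:
SELECT e.created_at, p.payload
FROM events e
JOIN pushes p ON p.event_id = e.id
LIMIT 5;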
SELECT * FROM key_count('pushes', 'payload');
SELECT explode_json_column('pushes', 'payload');
SELECT set_concrete_type('pushes', 'before', 'TEXT');
SELECT set_concrete_type('pushes', 'distinct_size', 'INTEGER');
SELECT set_concrete_type('pushes', 'head', 'TEXT');
SELECT set_concrete_type('pushes', 'push_id', 'BIGINT');
SELECT set_concrete_type('pushes', 'ref', 'TEXT');
SELECT set_concrete_type('pushes', 'size', 'INTEGER');
SELECT * FROM type_count('pushes', 'commits');
SELECT commits->0 FROM pushes WHERE id = 1;
SELECT jsonb_array_elements(commits) FROM pushes WHERE id = 1;
CREATE TABLE push_commits (
  id SERIAL PRIMARY KEY,
  push_id INTEGER NOT NULL REFERENCES pushes(id),
  commit JSONB NOT NULL
);
INSERT INTO push_commits (push_id, commit) SELECT id, jsonb_array_elements(commits) FROM pushes;
CREATE TABLE commits (
  id SERIAL PRIMARY KEY,
  payload JSONB NOT NULL
);
INSERT INTO commits (payload) SELECT DISTINCT commit FROM push_commits;
ALTER TABLE push_commits ADD COLUMN commit_id INTEGER REFERENCES commits (id);
CREATE UNIQUE INDEX ON commits (payload);
CREATE UNIQUE INDEX ON commits (MD5(payload::TEXT));
CREATE INDEX ON push_commits (MD5(commit::TEXT));
The MD5 expression indexes support the hash-equality match in the next step; we don't need a unique index on push_commits because the same commit can be in multiple pushes.
UPDATE push_commits SET commit_id = (SELECT id FROM commits WHERE MD5(commit::TEXT) = MD5(payload::TEXT));
ALTER TABLE push_commits
  ALTER COLUMN commit_id SET NOT NULL,
  DROP COLUMN commit,
  DROP COLUMN id,
  ADD PRIMARY KEY (push_id, commit_id);
SELECT * FROM type_count('commits', 'payload');
SELECT * FROM key_count('commits', 'payload');
SELECT explode_json_column('commits', 'payload');
SELECT set_concrete_type('commits', 'distinct', 'BOOLEAN');
SELECT set_concrete_type('commits', 'message', 'TEXT');
SELECT set_concrete_type('commits', 'sha', 'TEXT');
SELECT set_concrete_type('commits', 'url', 'TEXT');
We could extract the authors table here.
SELECT * FROM type_count('commits', 'author'); SELECT * FROM key_count('commits', 'author');
But we won't, as it doesn't cover anything new.
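If you did want to extract it, it would follow the actors pattern exactly; a sketch (untested, assuming author objects compare cleanly with jsonb equality):
CREATE TABLE authors (
  author_id SERIAL PRIMARY KEY,
  payload JSONB UNIQUE NOT NULL
);
INSERT INTO authors (payload) SELECT DISTINCT author FROM commits;
ALTER TABLE commits ADD COLUMN author_id INTEGER REFERENCES authors (author_id);
UPDATE commits SET author_id =
  (SELECT authors.author_id FROM authors WHERE authors.payload = commits.author);
ALTER TABLE commits ALTER COLUMN author_id SET NOT NULL, DROP COLUMN author;
SELECT explode_json_column('authors', 'payload');
-- then set_concrete_type() on each key, and finally DROP COLUMN payload.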
ALTER TABLE commits DROP COLUMN payload;
ALTER TABLE pushes DROP COLUMN payload, DROP COLUMN commits;
UPDATE events SET payload = NULL WHERE type = 'PushEvent';
SELECT nspname || '.' || relname AS "relation",
       pg_size_pretty(pg_relation_size(C.oid)) AS "size"
FROM pg_class C
LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace)
WHERE nspname NOT IN ('pg_catalog', 'information_schema')
ORDER BY pg_relation_size(C.oid) DESC;
This is a lot of data.
And that's just for an hour!
https://bigquery.cloud.google.com/table/githubarchive:day.yesterday
@jasonaowen
https://jasonaowen.github.io/pdxpug-incremental-schema https://github.com/jasonaowen/pdxpug-incremental-schema