On Github kerinin / tech_talk_presentation
Because history
*assuming the process doesn't fail before we can log the message counts
Both providers timestamp in seconds since the epoch, UTC
cat *_foo/bar
cut
cut -f1-3 # => extracts fields 1-3. cut -f5- # => extracts all fields after 4 cut -c10-20 # => extracts characters 10 through 20
sort
sort -t$'\t' -k10 # => sort by the 10th field, with tab-delimited fields sort -k3n -k10 # => primary sort on 3rd field as numeric, secondary on 10 sort -rn # => reverse numeric sort (-n alone is numeric, -r reverses)
uniq
sort | uniq # => returns unique elements (must be sorted first) sort | uniq -c # => returns unique elements with counts
fgrep '/foobar/' # => treats '/foobar/' as a string, not a regex (faster)
awk 'BEGIN {FS="\t"}; { if( $3 % 10 == 2) print $5 / $4}' sed -E 's/[[:space:]]+/ /g' # => -E enables extended regexes so '+' means "one or more"
vimdiff <(sort -k5 foo | cut -f3-) <(sort -k5 bar | cut -f3-) cut -f11 foo | vim - echo "$(wc -l < foo) / $(wc -l < bar)" | bc # => redirect into wc so it prints only the count, not "count filename"
function summation() { paste -sd+ - | bc } echo "1 2 3" | summation # => 6 function compact() { sed '/^$/d' } echo "1\n\n2\n3" | compact # => '1\n2\n3' function quantiles() { R --vanilla --slave -e "options(width = 400) data=scan(pipe('cat /dev/stdin')); quantile(data, seq(0,1,0.1));" } echo "1 2 3 4 5" | summation # => (10 quantiles) function ecdf() { R --vanilla --slave -e "options(width = 400) data=scan(pipe('cat /dev/stdin')); ecdf(data)($1);" } echo "1 2 3 4 5" | ecdf 3 # => 0.6
-- Deterministic sampling: build a stable per-record GUID, then keep the
-- records whose GUID hashes into bucket 0 of 1000 (a repeatable ~0.1% sample).
REGISTER s3://elasticmapreduce/libs/pig/0.9.1/piggybank-0.9.1-amzn.jar;
DEFINE HASH org.apache.pig.piggybank.evaluation.string.HashFNV();

-- GUID = hash(subject) concatenated with user_id and from.
with_guid = FOREACH records GENERATE
    CONCAT((chararray)HASH(subject), CONCAT(user_id, from)) as guid,
    scanned_at..;

-- HASH(guid, 1000) buckets each GUID into 0..999; bucket 0 is the sample.
sampled = FILTER with_guid BY HASH(guid, 1000) == 0;
-- Capture the cluster hostname at script-parse time; backticks run a shell
-- command and bind its stdout to the Pig parameter.
%declare EMR_HOST `uname -n`;
-- Parameter gymnastics: the caller passes $INPUT_SCHEMA with '~' in place of
-- ',' (literal commas would break parameter passing), so convert it back,
-- count the comma-separated fields with awk, and compute the last positional
-- index (fields are 0-based, hence FIELD_COUNT-1).
-- NOTE: '$$INDEX_END' is a double substitution -- Pig first expands
-- $INDEX_END to a number N, leaving the positional reference $N, so the
-- FOREACH projects fields $4 through the last field.
%declare INPUT_SCHEMA_C `echo '$INPUT_SCHEMA' | tr '~' ','`; %declare FIELD_COUNT `echo '$INPUT_SCHEMA_C' | awk -F ',' '{ print NF }'`; %declare INDEX_END `echo "$(( $FIELD_COUNT-1 ))"`; data = LOAD '$INPUT_PATH' AS ($INPUT_SCHEMA_C); less_data = FOREACH data GENERATE $4..$$INDEX_END;
-- Pull the same deterministic sample from two datasets so their outputs can
-- be diffed line-for-line.
-- FIX: the parameter names were misspelled ($BASE_INTPUT / $OTHER_INTPUT);
-- they must match the $BASE_INPUT / $OTHER_INPUT parameters implied by the
-- derived names BASE_INPUT_C / OTHER_INPUT_C.
%declare BASE_INPUT_C `echo "$BASE_INPUT" | tr '~' ','`;
%declare OTHER_INPUT_C `echo "$OTHER_INPUT" | tr '~' ','`;

REGISTER s3://elasticmapreduce/libs/pig/0.9.1/piggybank-0.9.1-amzn.jar;
DEFINE HASH org.apache.pig.piggybank.evaluation.string.HashFNV();

-- Load both inputs as raw text lines.
input_b = LOAD '$BASE_INPUT_C' USING TextLoader() AS line;
input_o = LOAD '$OTHER_INPUT_C' USING TextLoader() AS line;

-- Hash the first $FIRST_N characters of each line into $SAMPLE_DENOMINATOR
-- buckets and keep bucket 0 -- both sides select the same shared keys.
filtered_b = FILTER input_b BY HASH(SUBSTRING(line, 0, $FIRST_N), $SAMPLE_DENOMINATOR) == 0;
filtered_o = FILTER input_o BY HASH(SUBSTRING(line, 0, $FIRST_N), $SAMPLE_DENOMINATOR) == 0;

-- Impose a total order so the two outputs are directly comparable.
ordered_b = ORDER filtered_b BY *;
ordered_o = ORDER filtered_o BY *;

STORE ordered_b INTO '$BASE_OUTPUT';
STORE ordered_o INTO '$OTHER_OUTPUT';
-- Compare a base relation (b) against another relation (o) keyed on the
-- GUID in field $0:
--   b_guid_fail    : base rows whose GUID is NULL (GUID construction failed)
--   only_b         : GUIDs present in b but absent from o
--   both_singular  : GUIDs matching exactly 1:1 across b and o
--   both_multiple  : GUIDs with duplicate matches on either side
--   both_modified_raw / both_unmodified_raw : 1:1 pairs split by the
--     caller-supplied $MODIFIED_QUERY_C / $UNMODIFIED_QUERY_C predicates
-- NOTE(review): relation `o` is loaded outside this excerpt -- this snippet
-- is incomplete on its own.  '$$BASE_END' is a double parameter substitution
-- ($BASE_END expands to a number N, leaving the positional reference $N), so
-- the GENERATE $1..$$BASE_END projections drop the GUID and keep the
-- original base fields.
b = LOAD '$BASE_C' AS ($INPUT_SCHEMA_C); b_guid_fail = FILTER b BY $0 IS NULL; b_guid_fail = FOREACH b_guid_fail GENERATE $1..; joined = COGROUP b BY (chararray)$0, o BY (chararray)$0; only_b = FILTER joined BY IsEmpty(o); only_b = FOREACH only_b GENERATE FLATTEN(b); only_b = FOREACH only_b GENERATE $1..; both = FILTER joined BY (NOT IsEmpty(b)) AND (NOT IsEmpty(o)); both_singular = FILTER both BY (SIZE(b) == 1) AND (SIZE(o) == 1); both_multiple = FILTER both BY (SIZE(b) != 1) OR (SIZE(o) != 1); both_flattened = FOREACH both_singular GENERATE FLATTEN(b), FLATTEN(o); both_modified_raw = FILTER both_flattened BY $MODIFIED_QUERY_C; both_unmodified_raw = FILTER both_flattened BY $UNMODIFIED_QUERY_C; both_unmodified = FOREACH both_unmodified_raw GENERATE $1..$$BASE_END; both_from_b = FOREACH both_modified_raw GENERATE $1..$$BASE_END; both_modified_b = FOREACH both_multiple GENERATE FLATTEN(b); both_modified_b = FOREACH both_modified_b GENERATE $1..;
# Log-log scatter of two datasets on shared axes.
# FIX: base-graphics plot() has no add=T for scatter plots -- the second call
# would start a new plot instead of overlaying.  Overlay with points().
plot( log(data$x), log(data$y), ann=F, xlim=c(0,1), ylim=c(0,1), col='blue' )
points( log(test$x), log(test$y), col='red' )
# Histogram with decile-width bins (equal counts per bin, unequal widths).
# FIX: the base R function is quantile(), not quantiles().
hist( data$x, xlim=c(0,10), breaks=quantile(data$x, seq(0,1,0.1)) )
# Empirical CDF of data$x, with the x axis truncated to [0, 10].
plot.ecdf( data$x, xlim=c(0,10) )
# Vertical reference line at x = 10.
abline(v=10)
# Diagonal across the full plot region; par()$usr is c(x1, x2, y1, y2).
lines( par()$usr[1:2], par()$usr[3:4] )