require 'daru'
# Plot columns :a and :b of a small frame as a line chart: y range set to
# [0, 100], legend switched on, colour set to "green".
df = Daru::DataFrame.new({
  a: [1, 2, 3, 4, 5],
  b: [10, 14, 15, 17, 44]
})
df.plot(legends: [:a, :b], type: :line) do |plot, diagram|
  plot.yrange [0, 100]
  plot.legend true
  diagram.color "green"
end
require 'daru'
# Build a frame with three string columns (a, b, c) and three numeric ones
# (d, e, f), then take the column means; only d, e and f appear in the result.
df = Daru::DataFrame.new({
  a: %w[foo foo foo foo foo bar bar bar bar],
  b: %w[one one one two two one one two two],
  c: %w[small large large small small large small large small],
  d: [1, 2, 2, 3, 3, 4, 5, 6, 7],
  e: [2, 4, 4, 6, 6, 8, 10, 12, 14],
  f: [10, 20, 20, 30, 30, 40, 50, 60, 70]
})
df.mean
| | nil |
---|---|
d | 3.6666666666666665 |
e | 7.333333333333333 |
f | 36.666666666666664 |
# Calculate multiple statistical measures in one shot: count, mean, std,
# min and max for each numeric column
df.describe
| | d | e | f |
---|---|---|---|
count | 9 | 9 | 9 |
mean | 3.6666666666666665 | 7.333333333333333 | 36.666666666666664 |
std | 2.0 | 4.0 | 20.0 |
min | 1 | 2 | 10 |
max | 7 | 14 | 70 |
# Create a multi-indexed DataFrame: twelve three-level tuples index the rows
# and four three-level tuples name the columns.
row_tuples = [
  %i[a one bar],
  %i[a one baz],
  %i[a two bar],
  %i[a two baz],
  %i[b one bar],
  %i[b two bar],
  %i[b two baz],
  %i[b one foo],
  %i[c one bar],
  %i[c one baz],
  %i[c two foo],
  %i[c two bar]
]
row_index = Daru::MultiIndex.new(row_tuples)

# Two 12-element columns, each a 4-value pattern repeated three times.
col_high = [11, 12, 13, 14] * 3
col_low  = [1, 2, 3, 4] * 3

column_index = Daru::MultiIndex.new([
  %i[a one bar],
  %i[a two baz],
  %i[b two foo],
  %i[b one foo]
])

df_mi = Daru::DataFrame.new(
  [col_high, col_low, col_high, col_low],
  order: column_index,
  index: row_index
)
| | [:a, :one, :bar] | [:a, :two, :baz] | [:b, :two, :foo] | [:b, :one, :foo] |
---|---|---|---|---|
[:a, :one, :bar] | 11 | 1 | 11 | 1 |
[:a, :one, :baz] | 12 | 2 | 12 | 2 |
[:a, :two, :bar] | 13 | 3 | 13 | 3 |
[:a, :two, :baz] | 14 | 4 | 14 | 4 |
[:b, :one, :bar] | 11 | 1 | 11 | 1 |
[:b, :two, :bar] | 12 | 2 | 12 | 2 |
[:b, :two, :baz] | 13 | 3 | 13 | 3 |
[:b, :one, :foo] | 14 | 4 | 14 | 4 |
[:c, :one, :bar] | 11 | 1 | 11 | 1 |
[:c, :one, :baz] | 12 | 2 | 12 | 2 |
[:c, :two, :foo] | 13 | 3 | 13 | 3 |
[:c, :two, :bar] | 14 | 4 | 14 | 4 |
# Specify a complete tuple to choose a single row; the result is indexed by
# the column tuples
df_mi.row[:a, :one,:bar]
| | 0 |
---|---|
[:a, :one, :bar] | 11 |
[:a, :two, :baz] | 1 |
[:b, :two, :foo] | 11 |
[:b, :one, :foo] | 1 |
# Specify a partial tuple to select rows hierarchically: every row whose index
# starts with :a is returned, with that leading level dropped from the index
df_mi.row[:a]
| | [:a, :one, :bar] | [:a, :two, :baz] | [:b, :two, :foo] | [:b, :one, :foo] |
---|---|---|---|---|
[:one, :bar] | 11 | 1 | 11 | 1 |
[:one, :baz] | 12 | 2 | 12 | 2 |
[:two, :bar] | 13 | 3 | 13 | 3 |
[:two, :baz] | 14 | 4 | 14 | 4 |
# Group the rows on the (:a, :b) value pairs; `groups` maps each pair to the
# indexes of the rows that carry it.
df = Daru::DataFrame.new({
  a: %w[foo bar foo bar foo bar foo foo],
  b: %w[one one two three two two one three],
  c: [1, 2, 3, 1, 3, 6, 3, 8],
  d: [11, 22, 33, 44, 55, 66, 77, 88]
})
grouped = df.group_by(%i[a b])
grouped.groups
{["bar", "one"]=>[1], ["bar", "three"]=>[3], ["bar", "two"]=>[5], ["foo", "one"]=>[0, 6], ["foo", "three"]=>[7], ["foo", "two"]=>[2, 4]}
# First group by the columns :a and :b and then calculate the mean of the
# numeric columns (:c and :d) within each group.
grouped.mean
| | c | d |
---|---|---|
[:bar, :one] | 2 | 22 |
[:bar, :three] | 1 | 44 |
[:bar, :two] | 6 | 66 |
[:foo, :one] | 2.0 | 44.0 |
[:foo, :three] | 8 | 88 |
[:foo, :two] | 3.0 | 44.0 |
# Fetch the rows of a single group by its key; the original row index is kept
grouped.get_group(["foo", "one"])
| | a | b | c | d |
---|---|---|---|---|
0 | foo | one | 1 | 11 |
6 | foo | one | 3 | 77 |
require 'daru'
# Load the sales funnel CSV into a DataFrame.
# NOTE(review): the absolute path is specific to one machine — point it at
# your own copy of sales-funnel.csv.
sales = Daru::DataFrame.from_csv '/home/sameer/sales-funnel.csv'
| | account | manager | name | price | product | quantity | rep | status |
---|---|---|---|---|---|---|---|---|
0 | 714466 | Debra Henley | Trantow-Barrows | 30000 | CPU | 1 | Craig Booker | presented |
1 | 714466 | Debra Henley | Trantow-Barrows | 10000 | Software | 1 | Craig Booker | presented |
2 | 714466 | Debra Henley | Trantow-Barrows | 5000 | Maintenance | 2 | Craig Booker | pending |
3 | 737550 | Debra Henley | Fritsch, Russel and Anderson | 35000 | CPU | 1 | Craig Booker | declined |
4 | 146832 | Debra Henley | Kiehn-Spinka | 65000 | CPU | 2 | Daniel Hilton | won |
5 | 218895 | Debra Henley | Kulas Inc | 40000 | CPU | 2 | Daniel Hilton | pending |
6 | 218895 | Debra Henley | Kulas Inc | 10000 | Software | 1 | Daniel Hilton | presented |
7 | 412290 | Debra Henley | Jerde-Hilpert | 5000 | Maintenance | 2 | John Smith | pending |
8 | 740150 | Debra Henley | Barton LLC | 35000 | CPU | 1 | John Smith | declined |
9 | 141962 | Fred Anderson | Herman LLC | 65000 | CPU | 2 | Cedric Moss | won |
10 | 163416 | Fred Anderson | Purdy-Kunde | 30000 | CPU | 1 | Cedric Moss | presented |
11 | 239344 | Fred Anderson | Stokes LLC | 5000 | Maintenance | 1 | Cedric Moss | pending |
12 | 239344 | Fred Anderson | Stokes LLC | 10000 | Software | 1 | Cedric Moss | presented |
13 | 307599 | Fred Anderson | Kassulke, Ondricka and Metz | 7000 | Maintenance | 3 | Wendy Yule | won |
14 | 688981 | Fred Anderson | Keeling LLC | 100000 | CPU | 5 | Wendy Yule | won |
15 | 729833 | Fred Anderson | Koepp Ltd | 65000 | CPU | 2 | Wendy Yule | declined |
16 | 729833 | Fred Anderson | Koepp Ltd | 5000 | Monitor | 2 | Wendy Yule | presented |
# Pivot on manager and rep. No :values or :agg is given here; the figures
# below equal the per-group mean of each numeric column.
sales.pivot_table index: [:manager, :rep]
| | account | price | quantity |
---|---|---|---|
[:"Debra Henley", :"Craig Booker"] | 720237.0 | 20000.0 | 1.25 |
[:"Debra Henley", :"Daniel Hilton"] | 194874.0 | 38333.333333333336 | 1.6666666666666667 |
[:"Debra Henley", :"John Smith"] | 576220.0 | 20000.0 | 1.5 |
[:"Fred Anderson", :"Cedric Moss"] | 196016.5 | 27500.0 | 1.25 |
[:"Fred Anderson", :"Wendy Yule"] | 614061.5 | 44250.0 | 3.0 |
# Sum :price for each manager/rep pair, with one output column per product.
sales.pivot_table(index: [:manager,:rep], values: :price,vectors: [:product], agg: :sum)
| | [:price, :CPU] | [:price, :Software] | [:price, :Maintenance] | [:price, :Monitor] |
---|---|---|---|---|
[:"Debra Henley", :"Craig Booker"] | 65000 | 10000 | 5000 | |
[:"Debra Henley", :"Daniel Hilton"] | 105000 | 10000 | | |
[:"Debra Henley", :"John Smith"] | 35000 | | 5000 | |
[:"Fred Anderson", :"Cedric Moss"] | 95000 | 10000 | 5000 | |
[:"Fred Anderson", :"Wendy Yule"] | 165000 | | 7000 | 5000 |
df = Daru::DataFrame.new({
a: ['ff' , 'fwwq', 'efe', 'a', 'efef', 'zzzz', 'efgg', 'q', 'ggf'],
b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
c: ['small','large','large','small','small','large','small','large','small'],
d: [-1,2,-2,3,-3,4,-5,6,7],
e: [2,4,4,6,6,8,10,12,14]
})
df.sort([:a,:d], by: {a: lambda {|a,b| a.length <=> b.length }, b: lambda {|a,b| a.abs <=> b.abs }}, ascending: [false, true])
a | b | c | d | e | |
---|---|---|---|---|---|
6 | efgg | one | small | -5 | 10 |
4 | efef | two | small | -3 | 6 |
1 | fwwq | one | large | 2 | 4 |
5 | zzzz | one | large | 4 | 8 |
2 | efe | one | large | -2 | 4 |
8 | ggf | two | small | 7 | 14 |
0 | ff | one | small | -1 | 2 |
3 | a | two | small | 3 | 6 |
7 | q | two | large | 6 | 12 |