In [1]:

require 'daru'

df = Daru::DataFrame.new({a: [1,2,3,4,5], b: [10,14,15,17,44]})
df.plot legends: [:a, :b], type: :line do |p,d|
  p.yrange [0,100]
  p.legend true
  d.color "green"
end

Out[1]:

In [2]:

require 'daru'
# Calculate statistics of numeric columns
df = Daru::DataFrame.new({
      a: ['foo'  ,  'foo',  'foo',  'foo',  'foo',  'bar',  'bar',  'bar',  'bar'], 
      b: ['one'  ,  'one',  'one',  'two',  'two',  'one',  'one',  'two',  'two'],
      c: ['small','large','large','small','small','large','small','large','small'],
      d: [1,2,2,3,3,4,5,6,7],
      e: [2,4,4,6,6,8,10,12,14],
      f: [10,20,20,30,30,40,50,60,70]
    })
df.mean

Out[2]:

	nil
d	3.6666666666666665
e	7.333333333333333
f	36.666666666666664

In [3]:

# Calculate multiple statistical measures in one shot
df.describe

Out[3]:

	d	e	f
count	9	9	9
mean	3.6666666666666665	7.333333333333333	36.666666666666664
std	2.0	4.0	20.0
min	1	2	10
max	7	14	70

In [4]:

# Create a multi-indexed DataFrame
tuples = [
  [:a,:one,:bar],
  [:a,:one,:baz],
  [:a,:two,:bar],
  [:a,:two,:baz],
  [:b,:one,:bar],
  [:b,:two,:bar],
  [:b,:two,:baz],
  [:b,:one,:foo],
  [:c,:one,:bar],
  [:c,:one,:baz],
  [:c,:two,:foo],
  [:c,:two,:bar]
]
multi_index = Daru::MultiIndex.new(tuples)

vector_arry1 = [11,12,13,14,11,12,13,14,11,12,13,14]
vector_arry2 = [1,2,3,4,1,2,3,4,1,2,3,4]

order_mi = Daru::MultiIndex.new([
  [:a,:one,:bar],
  [:a,:two,:baz],
  [:b,:two,:foo],
  [:b,:one,:foo]])

df_mi = Daru::DataFrame.new([
  vector_arry1, 
  vector_arry2, 
  vector_arry1, 
  vector_arry2], order: order_mi, index: multi_index)

Out[4]:

	[:a, :one, :bar]	[:a, :two, :baz]	[:b, :two, :foo]	[:b, :one, :foo]
[:a, :one, :bar]	11	1	11	1
[:a, :one, :baz]	12	2	12	2
[:a, :two, :bar]	13	3	13	3
[:a, :two, :baz]	14	4	14	4
[:b, :one, :bar]	11	1	11	1
[:b, :two, :bar]	12	2	12	2
[:b, :two, :baz]	13	3	13	3
[:b, :one, :foo]	14	4	14	4
[:c, :one, :bar]	11	1	11	1
[:c, :one, :baz]	12	2	12	2
[:c, :two, :foo]	13	3	13	3
[:c, :two, :bar]	14	4	14	4

In [5]:

# Specify complete tuple to choose a single row
df_mi.row[:a, :one,:bar]

Out[5]:

	0
[:a, :one, :bar]	11
[:a, :two, :baz]	1
[:b, :two, :foo]	11
[:b, :one, :foo]	1

In [6]:

# Specify partial tuple to select index hierarchially
df_mi.row[:a]

Out[6]:

	[:a, :one, :bar]	[:a, :two, :baz]	[:b, :two, :foo]	[:b, :one, :foo]
[:one, :bar]	11	1	11	1
[:one, :baz]	12	2	12	2
[:two, :bar]	13	3	13	3
[:two, :baz]	14	4	14	4

In [7]:

# See grouped rows with the 'groups' method

df = Daru::DataFrame.new({
  a: %w{foo bar foo bar   foo bar foo foo},
  b: %w{one one two three two two one three},
  c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
  d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
})
grouped = df.group_by([:a, :b])
grouped.groups

Out[7]:

{["bar", "one"]=>[1], ["bar", "three"]=>[3], ["bar", "two"]=>[5], ["foo", "one"]=>[0, 6], ["foo", "three"]=>[7], ["foo", "two"]=>[2, 4]}

In [8]:

# First group by the columns :a and :b and then calculate mean of the grouped rows.
grouped.mean

Out[8]:

	c	d
[:bar, :one]	2	22
[:bar, :three]	1	44
[:bar, :two]	6	66
[:foo, :one]	2.0	44.0
[:foo, :three]	8	88
[:foo, :two]	3.0	44.0

In [9]:

grouped.get_group(["foo", "one"])

Out[9]:

	a	b	c	d
0	foo	one	1	11
6	foo	one	3	77

In [10]:

require 'daru'
sales = Daru::DataFrame.from_csv '/home/sameer/sales-funnel.csv'

Out[10]:

	account	manager	name	price	product	quantity	rep	status
0	714466	Debra Henley	Trantow-Barrows	30000	CPU	1	Craig Booker	presented
1	714466	Debra Henley	Trantow-Barrows	10000	Software	1	Craig Booker	presented
2	714466	Debra Henley	Trantow-Barrows	5000	Maintenance	2	Craig Booker	pending
3	737550	Debra Henley	Fritsch, Russel and Anderson	35000	CPU	1	Craig Booker	declined
4	146832	Debra Henley	Kiehn-Spinka	65000	CPU	2	Daniel Hilton	won
5	218895	Debra Henley	Kulas Inc	40000	CPU	2	Daniel Hilton	pending
6	218895	Debra Henley	Kulas Inc	10000	Software	1	Daniel Hilton	presented
7	412290	Debra Henley	Jerde-Hilpert	5000	Maintenance	2	John Smith	pending
8	740150	Debra Henley	Barton LLC	35000	CPU	1	John Smith	declined
9	141962	Fred Anderson	Herman LLC	65000	CPU	2	Cedric Moss	won
10	163416	Fred Anderson	Purdy-Kunde	30000	CPU	1	Cedric Moss	presented
11	239344	Fred Anderson	Stokes LLC	5000	Maintenance	1	Cedric Moss	pending
12	239344	Fred Anderson	Stokes LLC	10000	Software	1	Cedric Moss	presented
13	307599	Fred Anderson	Kassulke, Ondricka and Metz	7000	Maintenance	3	Wendy Yule	won
14	688981	Fred Anderson	Keeling LLC	100000	CPU	5	Wendy Yule	won
15	729833	Fred Anderson	Koepp Ltd	65000	CPU	2	Wendy Yule	declined
16	729833	Fred Anderson	Koepp Ltd	5000	Monitor	2	Wendy Yule	presented

In [11]:

sales.pivot_table index: [:manager, :rep]

Out[11]:

	account	price	quantity
[:"Debra Henley", :"Craig Booker"]	720237.0	20000.0	1.25
[:"Debra Henley", :"Daniel Hilton"]	194874.0	38333.333333333336	1.6666666666666667
[:"Debra Henley", :"John Smith"]	576220.0	20000.0	1.5
[:"Fred Anderson", :"Cedric Moss"]	196016.5	27500.0	1.25
[:"Fred Anderson", :"Wendy Yule"]	614061.5	44250.0	3.0

In [12]:

sales.pivot_table(index: [:manager,:rep], values: :price,vectors: [:product], agg: :sum)

Out[12]:

	[:price, :CPU]	[:price, :Software]	[:price, :Maintenance]	[:price, :Monitor]
[:"Debra Henley", :"Craig Booker"]	65000	10000	5000
[:"Debra Henley", :"Daniel Hilton"]	105000	10000
[:"Debra Henley", :"John Smith"]	35000		5000
[:"Fred Anderson", :"Cedric Moss"]	95000	10000	5000
[:"Fred Anderson", :"Wendy Yule"]	165000		7000	5000

In [13]:

df = Daru::DataFrame.new({
        a: ['ff'  ,  'fwwq',  'efe',  'a',  'efef',  'zzzz',  'efgg',  'q',  'ggf'], 
        b: ['one'  ,  'one',  'one',  'two',  'two',  'one',  'one',  'two',  'two'],
        c: ['small','large','large','small','small','large','small','large','small'],
        d: [-1,2,-2,3,-3,4,-5,6,7],
        e: [2,4,4,6,6,8,10,12,14]
      })
 df.sort([:a,:d], by: {a: lambda {|a,b| a.length <=> b.length }, b: lambda {|a,b| a.abs <=> b.abs }}, ascending: [false, true])

Out[13]:

	a	b	c	d	e
6	efgg	one	small	-5	10
4	efef	two	small	-3	6
1	fwwq	one	large	2	4
5	zzzz	one	large	4	8
2	efe	one	large	-2	4
8	ggf	two	small	7	14
0	ff	one	small	-1	2
3	a	two	small	3	6
7	q	two	large	6	12