Big Data Analytics Lab
DEPARTMENT OF
Artificial Intelligence And Data Science
EXPERIMENT- 1
AIM:
1) Implement the following data structures in Java
a) Linked Lists b) Stacks c) Queues d) Sets e) Maps
CODE:
a) Linked Lists
import java.io.*;
import java.util.LinkedList;
import java.util.ListIterator;
public class LLDemo
{
static void insertFirst(LinkedList<String> ll,String a)
{
ll.addFirst(a);
System.out.println(ll);
}
static void insertLast(LinkedList<String> ll,String a)
{
ll.addLast(a);
System.out.println(ll);
}
static void DeleteFirst(LinkedList<String> ll)
{
ll.removeFirst();
System.out.println(ll);
}
static void DeleteLast(LinkedList<String> ll)
{
ll.removeLast();
System.out.println(ll);
}
static void Find(LinkedList<String> ll,String a)
{
int pos=ll.indexOf(a);
if(pos==-1)
System.out.println("\nElement not found");
else
System.out.println("\nElement found at the position:"+pos);
}
static void Minsert(LinkedList<String> ll,String a,String after)
{
int pos=ll.indexOf(after);
System.out.println("Inserting at position:"+(pos+1));
ll.add(pos+1,a);
System.out.println(ll);
}
static void RemoveElement(LinkedList<String> ll,String a)
{
ll.remove(a);
System.out.println(ll);
}
public static void main(String arg[]) throws IOException
{
LinkedList<String> ll=new LinkedList<String>();
DataInputStream din=new DataInputStream(System.in);
String st;
while(true)
{
System.out.println("\nMenu:\n1.Insert First\n2.Insert Last\n3.Remove
First\n4.Remove Last\n5.Search For an element\n6.Middle Insert\n7.Remove");
System.out.println("\nEnter your operation:");int
ch=Integer.parseInt(din.readLine());
switch(ch)
{
case 1:
System.out.println("Enter an element to insert:");
st=din.readLine();
insertFirst(ll,st);
break;
case 2:
System.out.println("Enter an element to insert:");
st=din.readLine();
insertLast(ll,st);
break;
case 3:
DeleteFirst(ll);
break;
case 4:
DeleteLast(ll);
break;
case 5:
System.out.println("Enter an element to search:");
st=din.readLine();
Find(ll,st);
break;
case 6:
System.out.println("Enter an element to insert:");
st=din.readLine();
System.out.println("Enter the after element:");
String s=din.readLine();
Minsert(ll,st,s);
break;
case 7:
System.out.println("Enter an element to delete:");
st=din.readLine();
RemoveElement(ll,st);
break;
default: System.exit(0);
}
}
}
}
OUTPUT:
javac LLDemo.java
java LLDemo
Menu:
1.Insert First
2.Insert Last
3.Remove First
4.Remove Last
5.Search For an element
6.Middle Insert
7.Remove
...
Inserting at position:0
[9, 5]
B) Stacks
import java.util.*;
import java.io.*;
public class StackDemo
{
static void insert(Stack s,int a)
{
s.push(Integer.valueOf(a));
System.out.println("Elements in stack are:"+s);
}
static void delete(Stack s)
{
Integer a=(Integer)s.pop();
System.out.println("Deleted element is "+a);
System.out.println("Remaining elements in stack are:"+s);
}
static void first(Stack s)
{
Integer a=(Integer)s.peek();
System.out.println("the first element in stack is "+a);
}
public static void main(String a[]) throws IOException
{
Stack s=new Stack();
DataInputStream d=new DataInputStream(System.in);
while(true)
{
System.out.println("Menu\n1.Insert\n2.Delete\n3.First
Element\n4.Exit");
System.out.println("Enter your choice:");
int ch=Integer.parseInt(d.readLine());
switch(ch)
{
case 1:
System.out.println("Enter the element to insert:");
int ele=Integer.parseInt(d.readLine());
insert(s,ele);
break;
case 2: delete(s);
break;
case 3: first(s);
break;
case 4: System.exit(0);
default: System.out.println("invalid choice");
}
}
}
}
Output:
javac StackDemo.java
java StackDemo
Menu
1.Insert
2.Delete
3.First Element
4.Exit
Enter your choice:
1
Enter the element to insert:5
Elements in stack are:[5]
Menu
1.Insert
2.Delete
3.First Element
4.Exit
Enter your choice:
1
Enter the element to insert:8
Elements in stack are:[5, 8]
Menu
1.Insert
2.Delete
3.First Element
4.Exit
Enter your choice:
3
the first element in stack is 8
Menu
1.Insert
2.Delete
3.First Element
4.Exit
Enter your choice:
2
Deleted element is 8
Remaining elements in stack are:[5]
C) Queues
import java.util.*;
import java.io.*;
public class QueueDemo
{
static void insert(Queue q,int a)
{
q.add(Integer.valueOf(a)); System.out.println("Elements in Queue: "+q);
}
static void delete(Queue q)
{
Integer x=(Integer)q.remove();
System.out.println("Deleted element is "+x+"\nRemaining elements in Queue: "+q);
}
static void first(Queue q)
{
System.out.println("First element on queue:"+q.peek());
}
public static void main(String a[]) throws IOException
{
Queue q=new LinkedList();
DataInputStream din=new DataInputStream(System.in);
while(true)
{
System.out.println("Menu\n1.Insert\n2.Delete\n3.Peek\n4.Exit");
System.out.println("Enter your choice:");
int ch=Integer.parseInt(din.readLine());
switch(ch)
{
case 1: System.out.println("Enter element to insert into Queue:");
int ele=Integer.parseInt(din.readLine());
insert(q,ele);
break;
case 2: delete(q);
break;
case 3: first(q);
break;
case 4: System.exit(0);
default: System.out.println("Invalid choice");
}
}
}
}
Output:
javac QueueDemo.java
java QueueDemo
Menu
1.Insert
2. Delete
3.Peek
4.Exit
Enter your choice:
1
Enter element to insert into Queue:1
Elements in Queue: [1]
Menu
1.Insert
2.Delete
3.Peek
4.Exit
Enter your choice:
1
Enter element to insert into Queue:8
Elements in Queue: [1, 8]
Menu
1.Insert
2.Delete
3.Peek
4.Exit
Enter your choice:
3
First element on queue:1
D) Set
import java.util.Set;
import java.util.HashSet;
import java.util.TreeSet;
import java.util.Iterator;
public class SetDemo
{
public static void main(String args[])
{
int count[]={34,22,10,30,60};
Set<Integer> set=new HashSet<Integer>();
HashSet<Integer> clone1=new HashSet<Integer>();
try{
for(int i=0;i<5;i++){
set.add(count[i]);
clone1.add(count[i]);
}
System.out.println(set);
Object s=clone1.clone();
System.out.println("the cloned set is"+s);
System.out.println("size of set is "+set.size());
boolean b=set.remove(30);
if(b)
System.out.println("element is removed");
System.out.println("the set display using iterator");
Iterator it=set.iterator();
while(it.hasNext())
System.out.println(it.next()+" ");
TreeSet sortedSet=new TreeSet<Integer>();
sortedSet.addAll(set);
System.out.println("The sorted list is:");
System.out.println(sortedSet);
System.out.println("the set is"+(TreeSet)sortedSet.headSet(40));
System.out.println("the subset is "+(TreeSet)sortedSet.subSet(22,40));
System.out.println("the tailset is "+(TreeSet)sortedSet.tailSet(22));
System.out.println("The First element of the set is: "+sortedSet.first());
System.out.println("The last element of the set is: "+sortedSet.last());
}
catch(Exception e){}
}
}
Output:
javac SetDemo.java
java SetDemo
[34, 22, 10, 30, 60]
the cloned set is[34, 22, 10, 60, 30]
size of set is 5
element is removed
the set display using iterator
34
22
10
60
The sorted list is:
[10, 22, 34, 60]
the set is[10, 22, 34]
the subset is [22, 34]
the tailset is [22, 34, 60]
The First element of the set is: 10
The last element of the set is: 60
E) Map
import java.util.HashMap;
import java.util.Map;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeMap;
public class MapDemo {
public static void main(String args[]) {
HashMap<Integer,String> hmap=new HashMap<Integer,String>();
hmap.put(49,"nsr"); hmap.put(2,"pnr"); hmap.put(3,"chari"); hmap.put(7,"kry"); hmap.put(12,"chp");
System.out.println("the original hashmap is :"+hmap);
System.out.println("the cloned hashmap is:"+hmap.clone());
System.out.println("the size of hashmap is"+hmap.size());
Set<Map.Entry<Integer,String>> entries=hmap.entrySet();
Iterator<Map.Entry<Integer,String>> it=entries.iterator();
while(it.hasNext()) {
Map.Entry<Integer,String> me=it.next();
System.out.println("key is: "+me.getKey()+" & Value is: "+me.getValue());
}
System.out.println("Value at index 2 is: "+hmap.get(2));
hmap.remove(3);
System.out.println("Map key and values after removal:");
for(Map.Entry<Integer,String> me:hmap.entrySet())
System.out.println("Key is: "+me.getKey()+" & Value is: "+me.getValue());
if(hmap.containsKey(7))
System.out.println("key exists");
else
System.out.println("key does not exist");
if(hmap.containsValue("mouni"))
System.out.println("Value exists");
else
System.out.println("Value does not exist");
}
}
Output:
javac MapDemo.java
java MapDemo
the original hashmap is :{49=nsr, 2=pnr, 3=chari, 7=kry, 12=chp}
the cloned hashmap is:{2=pnr, 49=nsr, 3=chari, 7=kry, 12=chp}
the size of hashmap is5
key is: 49 & Value is: nsr
key is: 2 & Value is: pnr
key is: 3 & Value is: chari
key is: 7 & Value is: kry
key is: 12 & Value is: chp
Value at index 2 is: pnr
Map key and values after removal:
Key is: 49 & Value is: nsr
Key is: 2 & Value is: pnr
Key is: 7 & Value is: kry
Key is: 12 & Value is: chp
key exists
Value does not exist
https://www.virtualbox.org/wiki/Downloads
https://drive.google.com/drive/folders/1KyzfpcSI_iJShS76BfWibBjuX2j0RBev
Please click on the following link to download CodeTantra's Hadoop Virtual Disk Image.
https://s3.ap-south-1.amazonaws.com/ct-hadoop-installation/CodeTantra-Hadoop-VDI.zip
Please find the Hadoop installation links and download the appropriate one according to your OS
architecture.
Hadoop 64bit
https://s3.ap-south-1.amazonaws.com/ct-hadoop-installation/CT-Hadoop-LinuxSetup-64bit-v3.zip
Hadoop 32bit
https://s3.ap-south-1.amazonaws.com/ct-hadoop-installation/CT-Hadoop-LinuxSetup-32bit-v3.zip
Cloudera Version 4.7
https://gecgudlavallerumic-my.sharepoint.com/:f:/g/personal/bhagec_gecgudlavallerumic_in/EpfaD_p-
lKdNlu03RVBG9KoBsxMmsM8Lhg1VRY36solKhw?e=D1wk3A
2. Installing WinSCP: for file transfer between the server and a remote system
A simple way is to drag and drop files between the local desktop (left-hand window) and the remote server/cluster (right-hand window).
5. Set the RAM size as given below and click on Next. Approximately half of the system RAM needs to be allocated to the VirtualBox instance.
6. Select "Use an existing virtual hard drive file" and click on Create.
System requirements:
This requires a 64-bit host OS and a virtualization product that can support a 64-bit guest OS.
It is better to have 8 GB of RAM since we are using VirtualBox, but 4 GB is also fine for practice.
Double Click on the “poweroff” button and you will be accessing Cloudera Manager
Cloudera Manager UserId/Password: cloudera/cloudera
LINUX COMMANDS
EXPERIMENT-2
AIM: Perform setting up and installing Hadoop in its three operating modes:
Standalone,
Pseudo distributed,
Fully distributed
A) Standalone
Step 4: After Java is installed, check whether it is installed on your system with the command below:
Command: java -version
9.1. Open hadoop-env.sh and add JAVA_HOME as:
export JAVA_HOME=/usr/lib/jvm/java-6-openjdk-amd64
9.2. Open the .bashrc file and append the following:
export HADOOP_HOME=/usr/local/hadoop
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin
#HADOOP VARIABLES END
Step 10: To check the Hadoop version, give the command below:
Command: hadoop version
</property>
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
<property>
<name>mapred.job.tracker</name>
<value>localhost:8021</value>
</property>
Distributed mode:
</configuration>
HDFS NameNode Web UI
Open the browser and type:
http://localhost:50070/
MapReduce JobTracker Web UI
http://localhost:50030/
HDFS Logs
http://localhost:50070/logs/
MapReduce TaskTracker Web UI
http://localhost:50060/
EXPERIMENT-3
AIM: Implement the following file management tasks in Hadoop:
⮚ Adding files and directories
⮚ Retrieving files
⮚ Deleting files
Hint: A typical Hadoop workflow creates data files (such as log files) elsewhere and
copies them into HDFS using one of the above command line utilities.
Retrieving files:
Hadoop provides a set of command line utilities that work similarly to the Linux file commands.
Default directories
Local file system : /home/cloudera
HDFS : /user/cloudera
The general form is:
hadoop fs -cmd <args>
where cmd is the specific file command and <args> is a variable number of arguments.
Example:
Command for listing files is:
hadoop fs -ls
The most common file management tasks in Hadoop are:
• Adding files and directories
• Retrieving files
• Deleting files
i) Adding files and directories: Before running Hadoop programs, the data must first be put into HDFS.
1. mkdir : Create a directory in HDFS at given path(s).
hadoop fs -mkdir <paths>
Example:
hadoop fs -mkdir /user/cloudera/myfolder1
(absolute path)
Or
hadoop fs -mkdir myfolder1
(relative path)
Create a sub directory
Example:
hadoop fs -mkdir /user/cloudera/myfolder1/subfolder1
2. put : Copy a single src file, or multiple src files, from the local file system to the Hadoop distributed file system.
hadoop fs -put <localsrc> ... <dst>
Example
Create two files in the local file system using cat or any editor (nano or gedit):
cat > file1
This is Hadoop Lab
Ctrl+D
cat > file2
This is Bigdata Lab
Ctrl+D
hadoop fs -put file1 /user/cloudera/myfolder1
hadoop fs -copyFromLocal file2 /user/cloudera/myfolder1/subfolder1
hadoop fs -put file3 . (puts the file in the default directory)
Checking:
hadoop fs -lsr /user/cloudera/myfolder1
hadoop fs -ls /
ii) Retrieving files: Another way to access the data is to display it. We can use the Hadoop file command with Unix pipes to send its output for further processing.
hadoop fs -cat file1
hadoop fs -cat file1 | head
hadoop fs -tail file1 (displays the last 1 KB of file1)
iii) Deleting files
The Hadoop command for removing files is rm.
Example :
hadoop fs -rm file1
hadoop fs -rmr myfolder1 (remove directory recursively)
Looking Up Help
A list of Hadoop file commands, together with the usage and description of each command, can be seen by using the help command.
hadoop fs -help cmd
Example :
hadoop fs -help ls
3. du : Shows disk usage, in bytes, for all the files which match path; filenames are reported
with the full HDFS protocol prefix.
hadoop fs -du <path>
Example:
hadoop fs -du /user/cloudera
4. dus : Like -du, but prints a summary of disk usage of all files/directories in the path.
hadoop fs -dus <path>
Example:
hadoop fs -dus /user/cloudera
5. moveFromLocal : Similar to put, except that the local copy is deleted after it is copied into HDFS.
hadoop fs -moveFromLocal <localsrc> <dst>
Move single src file, or multiple src files from local file system to the Hadoop distributed file system
Example
Create a file in the local file system using cat or any editor (nano or gedit):
cat > file4
This is Hadoop and BigdataLab
Ctrl+D
8. getmerge : Concatenates the files in the source directory into the destination file.
hadoop fs -getmerge <src> <localdst> [addnl]
The addnl option adds a newline character at the end of each file.
Example :
hadoop fs -getmerge file1 file2 mergefile
9. chown : used to change the ownership of files. The -R option can be used to recursively
change the owner of a directory structure.
hadoop fs -chown [-R] <NewOwnerName>[:NewGroupName] <file or dir name>
12. touchz : Creates a zero-byte file. This is similar to the touch command in Unix.
hadoop fs -touchz /user/cloudera/filename
Example :
hadoop fs -touchz /user/cloudera/file0
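The same file-management tasks can also be performed from Java. Below is a minimal sketch (not part of the original lab record; the class name and paths are illustrative, reusing the example paths from above) that uses the Hadoop FileSystem API to add, retrieve, and delete files:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsFileOps {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();   // picks up core-site.xml / hdfs-site.xml from the classpath
        FileSystem fs = FileSystem.get(conf);

        // Adding: create a directory and copy a local file into it
        Path dir = new Path("/user/cloudera/myfolder1");
        fs.mkdirs(dir);
        fs.copyFromLocalFile(new Path("file1"), dir);

        // Retrieving: copy the file back to the local file system
        fs.copyToLocalFile(new Path("/user/cloudera/myfolder1/file1"), new Path("file1.copy"));

        // Deleting: remove the directory recursively (second argument = recursive)
        fs.delete(dir, true);

        fs.close();
    }
}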
EXPERIMENT-4
The input data set, which can be a terabyte file, is broken down into chunks of 64 MB by default and forms the input to the Mapper function. The Mapper function then filters and sorts these data chunks on the Hadoop cluster data nodes based on the business requirement.
After the distributed computation is completed, the output of the mapper function is passed to the reducer function, which combines all the elements back together to provide the resulting output.
An example of Hadoop MapReduce usage is the "word-count" algorithm in raw Java, using classes provided by the Hadoop libraries: count how many times a given word such as "are", "Hole", or "the" occurs in a document, which is the input file.
To begin, consider the figure below, which breaks the word-count process into steps.
Figure: Hadoop MapReduce word count process.
The building blocks of Hadoop MapReduce programs are broadly classified into two phases, map and reduce. Both phases take input data in the form of (key, value) pairs and produce output data as (key, value) pairs. The mapper program runs in parallel on the data nodes in the cluster. Once the map phase is over, the reducers run in parallel on the data nodes.
The sort and shuffle stage creates an iterator of values for each key, e.g. (are, [1, 1, 1]), which is passed to the reduce function; reduce sums up the values for each key to generate (K2, V2) as output. The illustration of the same is shown in the above figure (word count MapReduce process).
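To make this (key, value) flow concrete before looking at the Hadoop classes, the following small plain-Java program (illustrative only; no Hadoop involved, and the class name and sample lines are made up) simulates the three stages for a tiny input so the flow can be traced by hand:

import java.util.*;

public class WordCountPhases {
    public static void main(String[] args) {
        String[] lines = { "are you there", "there you are" };

        // Map phase: emit (word, 1) for every token in every line.
        List<Map.Entry<String, Integer>> mapped = new ArrayList<>();
        for (String line : lines)
            for (String word : line.split("\\s+"))
                mapped.add(new AbstractMap.SimpleEntry<>(word, 1));

        // Shuffle/sort phase: group the emitted pairs by key, e.g. (are, [1, 1]).
        Map<String, List<Integer>> grouped = new TreeMap<>();
        for (Map.Entry<String, Integer> kv : mapped)
            grouped.computeIfAbsent(kv.getKey(), k -> new ArrayList<>()).add(kv.getValue());

        // Reduce phase: sum the values for each key, e.g. (are, 2).
        for (Map.Entry<String, List<Integer>> kv : grouped.entrySet()) {
            int sum = 0;
            for (int v : kv.getValue()) sum += v;
            System.out.println(kv.getKey() + "\t" + sum);
        }
    }
}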
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
Step 2
The summary of the classes defined in the “word count map reduce” program is as below :
We have created a package in Eclipse and defined a class named "WordCount". The "WordCount" class has two nested classes and one main method. "Mapper" and "Reducer" are classes provided by the Hadoop library, and their source code is written by the Hadoop developers. We extend the "Mapper" and "Reducer" classes with our "Map" and "Reduce" classes respectively, using inheritance.
Let us understand what LongWritable, Text, and IntWritable are. For that, we first need to understand serialization and de-serialization in Java.
Object serialization is a mechanism where an object can be represented as a sequence of bytes that
includes the object’s data as well as information about the object’s type and the types of data stored in
the object.
The serialized object is written in a file and then de-serialized to recreate the object back into memory.
For example, the word "Hai" has a serialized value of, say, "0010110"; once it is written to a file, it can be de-serialized back to "Hai".
In the Hadoop MapReduce framework, the mapper output is fed in as the reducer input. These intermediate values are always in serialized form.
Serialization and de-serialization in Java are expressed through the Writable interface in Hadoop MapReduce programming. Therefore, the Hadoop developers have provided serializable (Writable) counterparts for the common data types. For example, int in Java is IntWritable in the MapReduce framework, String in Java is Text in the MapReduce framework, and so on.
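To illustrate what Writable means in practice, the following short program (not part of the lab record; the class name is arbitrary) serializes a Text and an IntWritable to an in-memory byte stream by hand and reads them back:

import java.io.*;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class WritableRoundTrip {
    public static void main(String[] args) throws IOException {
        // serialize a Text and an IntWritable into an in-memory byte stream
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bytes);
        new Text("Hai").write(out);
        new IntWritable(1).write(out);

        // de-serialize them back into fresh objects
        DataInputStream in = new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()));
        Text word = new Text();
        IntWritable count = new IntWritable();
        word.readFields(in);
        count.readFields(in);
        System.out.println(word + " -> " + count.get());   // prints: Hai -> 1
    }
}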
The input and output of the mapper and reducer are in (key, value) format. For a text input file, the key generated for each line is the byte offset at which that line starts in the file, and the value is the content of the line. For example, if the file starts with the line "(1, aaa)", its key is 0 and its value is "(1, aaa)". If the next line "(2, bbb)" starts at byte offset 9, its key is 9 and its value is "(2, bbb)".
Consider that the first line in the file is "Hi! How are you".
The mapper input key-value pair is then (0, "Hi! How are you"): the key received by the mapper class has the data type "LongWritable" (the first type parameter) and the value received by the mapper class is "Text".
The mapper output is each word together with a count of 1, i.e. (Hi!, 1), (How, 1), (are, 1), (you, 1).
If the word "are" is repeated twice in the input, the mapper emits (are, 1) twice and the shuffle stage groups these into (are, [1, 1]). Hence, the key of the mapper output is "Text" while the value is "IntWritable". This output of the mapper is fed as the input to the reducer. Therefore, if the reducer input is (are, [1, 1]), then the output of the reducer will be (are, 2). Here, the reducer output has the key as "Text" and the value as "IntWritable".
Step 3
Define the Map class. The key and value input pair have to be serializable by the framework and hence need to implement the Writable interface. Output pairs do not need to be of the same types as input pairs. Output pairs are collected with calls to the context.
Inside the static class "Map" we declare an object named "one" to store the incremental value for each occurrence of a word, and the particular word is stored in the variable named "word".
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException,
InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
}
The above piece of code takes each line as input and stores it in the variable "line". StringTokenizer allows an application to break a string into tokens. For example:
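As a small standalone illustration (the class name is arbitrary and not part of the lab listing), the following program breaks the sample sentence used above into tokens:

import java.util.StringTokenizer;

public class TokenizerExample {
    public static void main(String[] args) {
        StringTokenizer tokenizer = new StringTokenizer("Hi! How are you");
        while (tokenizer.hasMoreTokens()) {
            System.out.println(tokenizer.nextToken());  // prints Hi!, How, are, you on separate lines
        }
    }
}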
As long as the "tokenizer" has more tokens, the while loop keeps running; for each token, the mapper sets "word" and writes (word, one) to the context. The framework takes care of calling map() for every line of the file and of collecting the emitted pairs. For example, if the input contains "hai hai hai", the mapper emits (hai, 1) three times, which the framework groups as (hai, [1, 1, 1]).
Step 4
The Reduce class accepts shuffled key-value pairs as input. The code then totals the values for the key-value pairs with the same key and outputs the totaled key-value pairs, e.g. <word, 3>.
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
Step 5
The main method sets up the MapReduce configuration by defining the type of input. In this case, the input is text. The code then defines the Map, Combine, and Reduce classes, as well as specifying the input/output formats.
Step 6
The full Java code for the “word count” program is as below:
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class WordCount {
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
}
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "wordcount");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
//import org.apache.hadoop.mapreduce.Counter;
public class WordCountMapper extends
Mapper<LongWritable, Text, Text, LongWritable> {
}*/
}
}
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.io.Text;
}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
//import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.Tool;
public class WordCountJob implements Tool{
private Configuration conf;
@Override
public Configuration getConf()
{
return conf;
}
@Override
public void setConf(Configuration conf)
{
this.conf=conf;
}
@Override
public int run(String []args)throws Exception
{
Implementation: Run a basic word count MapReduce program to understand the MapReduce paradigm.
PROGRAM:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values, Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}}
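The listing above is cut off before the driver. A minimal main method in the style of the standard Hadoop WordCount example, completing the class (the job name "word count" is illustrative), would be:

public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}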
EXPERIMENT-5
AIM: Write a Map Reduce program that mines weather data. Weather sensors collecting data every
hour at many locations across the globe gather a large volume of log data, which is a good candidate
for analysis with MapReduce, since it is semi structured and record-oriented.
Step 1.3: Repeat steps 1.1 and 1.2 for all years from 1901 to 2020.
Step 1.4: Store all the files 1901.gz, 1902.gz, ..., 2020.gz into a folder named all in the cloudera home directory.
Step 1.5 (Merge all .gz files in the all directory): the .gz files in the all directory, i.e. 1901.gz, 1902.gz, ..., 2020.gz, are merged into a single .gz file and stored as ncdc.gz with the following command:
▪ $ zcat 1901.gz 1902.gz 1903.gz ... 2020.gz | gzip -c > ncdc.gz
Step 1.6: Extract it by right-clicking and rename the result to ncdc.txt.
Step 2: Now move the input file, ncdc.txt, to HDFS using the following command in a terminal:
$ hadoop fs -put <local input file path> <hdfs path>
Make sure that the file does not already exist in HDFS. Example:
$ hadoop fs -copyFromLocal /home/cloudera/Downloads/all/ncdc.txt /user/cloudera/data
Step 3: Open the Eclipse Java IDE in Cloudera and create a Java project with any name. Create three classes in the same project, open each class, and type in the code given on pages 46, 47, and 48 of the prescribed textbook (Tom White) respectively.
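For reference, below is a condensed sketch of those three pieces (mapper, reducer, and driver) merged into a single file. This is not the textbook listing itself; it assumes the NCDC fixed-width record layout used by that example (year in columns 15-19, air temperature in 87-92, quality code in 92-93), so treat it only as a starting point:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MaxTemperature {
  public static class MaxTemperatureMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final int MISSING = 9999;
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
      String line = value.toString();
      String year = line.substring(15, 19);
      int airTemperature;
      if (line.charAt(87) == '+') {               // skip a leading plus sign
        airTemperature = Integer.parseInt(line.substring(88, 92));
      } else {
        airTemperature = Integer.parseInt(line.substring(87, 92));
      }
      String quality = line.substring(92, 93);
      if (airTemperature != MISSING && quality.matches("[01459]")) {
        context.write(new Text(year), new IntWritable(airTemperature));
      }
    }
  }
  public static class MaxTemperatureReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
      int maxValue = Integer.MIN_VALUE;
      for (IntWritable value : values) {
        maxValue = Math.max(maxValue, value.get());   // keep the maximum temperature per year
      }
      context.write(key, new IntWritable(maxValue));
    }
  }
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "Max temperature");
    job.setJarByClass(MaxTemperature.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setMapperClass(MaxTemperatureMapper.class);
    job.setReducerClass(MaxTemperatureReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}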
Step 5: Now, once the errors are resolved, right-click on your project and save (export) it to some location on the local file system with a .jar file extension.
Step 6: Command to run the MapReduce program:
a. hadoop jar <jar location> <packagename.driver class name> <input file hdfs path> <output hdfs folder path>
ex: $ hadoop jar /home/cloudera/max.jar tem.com.MaxTemperature /user/cloudera/data/ncdc.txt /user/cloudera/data/output
Step 7: Check the output by opening the HDFS file browser. In the folder /user/cloudera/data/output you will get the output file:
if a reducer is used: part-r-00000
if no reducer is used: part-m-00000
Open it and you will see:
1901 317
1902 244
1903 289
1904 256
1905 283
1906 294
1907 283
EXPERIMENT – 6
AIM: Write a MapReduce program that mines weather data.
Weather sensors collecting data every hour at many locations across the globe gather a large volume of log data, which is a good candidate for analysis with MapReduce, since it is semi-structured and record-oriented.
PROGRAM:
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.conf.Configuration;
public class MyMaxMin {
public static class MaxTemperatureMapper extends Mapper<LongWritable, Text, Text,
Text> {
@Override
public void map(LongWritable arg0, Text Value, Context context) throws IOException,
InterruptedException {
String line = Value.toString();
if (!(line.length() == 0)) {
String date = line.substring(6, 14);
float temp_Min = Float.parseFloat(line.substring(22, 28).trim());
float temp_Max = Float.parseFloat(line.substring(32, 36).trim());
if (temp_Max > 35.0) {
context.write(new Text("Hot Day " + date),new
Text(String.valueOf(temp_Max)));
}
if (temp_Min < 10) {
context.write(new Text("Cold Day " + date),new
Text(String.valueOf(temp_Min)));
}
}
}
}
public static class MaxTemperatureReducer extends Reducer<Text, Text, Text, Text>
{
public void reduce(Text Key, Iterable<Text> Values, Context context) throws
IOException, InterruptedException
{
String temperature = Values.iterator().next().toString();
context.write(Key, new Text(temperature));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "weather example");
job.setJarByClass(MyMaxMin.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setMapperClass(MaxTemperatureMapper.class);
job.setReducerClass(MaxTemperatureReducer.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
Path OutputPath = new Path(args[1]);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Output:
EXPERIMENT-7
AIM: Write a program to find how many flights between origin and destination by
using Map reduce.
PROGRAM:
package com.lbrce.flight;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.conf.Configuration;
public class Flight {
public static class FlightMapper extends Mapper<LongWritable, Text, Text, Text>
{
public void map(LongWritable arg0, Text Value, Context context) throws IOException,
InterruptedException
{
String line = Value.toString();
if (!(line.length() == 0))
{
String fno = line.substring(0, 4);
String origin=line.substring(8, 12).trim();
String dest =line.substring(13, 18).trim();
if(origin.equals("HYD")&&dest.equals("SAN"))
{
context.write(new Text("Flight " + fno),new Text("HYD SAN"));
}
}
}
}
public static class FlightReducer extends Reducer<Text, Text, Text, Text>
{
public void reduce(Text Key, Iterable<Text> Values, Context context) throws IOException,
InterruptedException
{
String nof = Values.iterator().next().toString();
context.write(Key, new Text(nof));
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
Job job = new Job(conf, "flight example");
job.setJarByClass(Flight.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setMapperClass(FlightMapper.class);
job.setReducerClass(FlightReducer.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Input:
Flight Num   Origin   Destination   Arrival Time
------------------------------------------------
AI111 HYD SAN 22:30
QA222 BOM NEY 24:26
SA333 DEL DAL 32:24
BA444 CHE SAN 42:15
SA555 HYD NEJ 24:26
QA666 BAN DAL 22:30
AI777 HYD SAN 32:24
SA888 DEL SAN 42:15
BA999 BAN NEY 32:24
SA123 BOM NEJ 24:26
QA321 CHE SAN 42:15
SA345 BAN DAL 24:26
AI456 CHE SAN 42:15
BA789 HYD SAN 22:30
QA156 BOM NEJ 32:24
SA234 BAN DAL 24:26
BA132 BOM NEJ 42:15
AI431 HYD SAN 22:30
AA001 CHE SAN 32:24
AA007 BOM NEJ 24:26
AA009 HYD SAN 24:26
DT876 BAN DAL 42:15
JT567 HYD SAN 22:30
EXPERIMENT – 8
AIM: Installation of Hive along with practice examples.
⮚ Create a file named jpox.properties and add the following lines into it:
javax.jdo.PersistenceManagerFactoryClass = org.jpox.PersistenceManagerFactoryImpl
org.jpox.autoCreateSchema = false
org.jpox.validateTables = false
org.jpox.validateColumns = false
org.jpox.validateConstraints = false
org.jpox.storeManagerType = rdbms
org.jpox.autoCreateSchema = true
org.jpox.autoStartMechanismMode = checked
org.jpox.transactionIsolation = read_committed
javax.jdo.option.DetachAllOnCommit = true
javax.jdo.option.NontransactionalRead = true
javax.jdo.option.ConnectionDriverName = org.apache.derby.jdbc.ClientDriver
javax.jdo.option.ConnectionURL = jdbc:derby://hadoop1:1527/metastore_db;create = true
javax.jdo.option.ConnectionUserName = APP
javax.jdo.option.ConnectionPassword = mine
Example
We will insert the following data into the table. It is a text file named sample.txt in
/home/user directory.
1201 Gopal 45000 Technical manager
1202 Manisha 45000 Proof reader
1203 Masthanvali 40000 Technical writer
1204 Krian 40000 Hr Admin
+------+-------------+--------+--------------------+------+
| ID   | Name        | Salary | Designation        | Dept |
+------+-------------+--------+--------------------+------+
| 1201 | Gopal       | 45000  | Technical manager  | TP   |
| 1202 | Manisha     | 45000  | Proofreader        | PR   |
| 1203 | Masthanvali | 40000  | Technical writer   | TP   |
| 1204 | Krian       | 40000  | Hr Admin           | HR   |
+------+-------------+--------+--------------------+------+
Functions:
+------+-------------+--------+--------------------+-------+
| ID   | Name        | Salary | Designation        | Dept  |
+------+-------------+--------+--------------------+-------+
| 1201 | Gopal       | 45000  | Technical manager  | TP    |
| 1202 | Manisha     | 45000  | Proofreader        | PR    |
| 1203 | Masthanvali | 40000  | Technical writer   | TP    |
| 1204 | Krian       | 40000  | Hr Admin           | HR    |
| 1205 | Kranthi     | 30000  | Op Admin           | Admin |
+------+-------------+--------+--------------------+-------+
The following query creates a view containing the employee details for the above scenario:
hive> CREATE VIEW emp_30000 AS
> SELECT * FROM employee
> WHERE salary>30000;
⮚ Indexes:
The following query creates an index:
hive> CREATE INDEX index_salary ON TABLE employee(salary)
> AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler';
EXPERIMENT – 9
AIM: Install and Run Pig then write Pig Latin scripts to sort, group, join, project, and filter your
data.
PROCEDURE:
⮚ Download and extract pig-0.13.0.
Command: wget https://archive.apache.org/dist/pig/pig-0.13.0/pig-0.13.0.tar.gz
Command: tar xvf pig-0.13.0.tar.gz
Command: sudo mv pig-0.13.0 /usr/lib/pig
⮚ Set Path for pig
Command: sudo gedit $HOME/.bashrc
export PIG_HOME=/usr/lib/pig
export PATH=$PATH:$PIG_HOME/bin
export PIG_CLASSPATH=$HADOOP_COMMON_HOME/conf
⮚ pig.properties file
In the conf folder of Pig, we have a file named pig.properties. In the pig.properties file,
you can set various parameters as given below.
pig -h properties
⮚ Verifying the Installation
Verify the installation of Apache Pig by typing the version command. If the installation is
successful, you will get the version of Apache Pig as shown below.
Command: pig -version
Grouping Of Data:
⮚ put dataset into hadoop
Command: hadoop fs -put pig/input/data.txt pig_data/
Joining Of Data:
⮚ Run pig script program of JOIN on hadoop mapreduce
grunt>
customers = LOAD 'hdfs://localhost:8020/user/pcetcse/pig_data/customers.txt'
USING PigStorage(',') as (id:int, name:chararray, age:int, address:chararray, salary:int);
orders = LOAD 'hdfs://localhost:8020/user/pcetcse/pig_data/orders.txt'
USING PigStorage(',') as (oid:int, date:chararray, customer_id:int, amount:int);
(1,Rajiv,Reddy,21,9848022337,Hyderabad)
Filtering of data:
⮚ Run pig script program of FILTER on hadoop mapreduce
Assume that we have a file named student_details.txt in the HDFS directory /pig_data/
as shown below.
student_details.txt
001,Rajiv,Reddy,21,9848022337,Hyderabad
002,siddarth,Battacharya,22,9848022338,Kolkata
003,Rajesh,Khanna,22,9848022339,Delhi
004,Preethi,Agarwal,21,9848022330,Pune
005,Trupthi,Mohanthy,23,9848022336,Bhuwaneshwar
006,Archana,Mishra,23,9848022335,Chennai
007,Komal,Nayak,24,9848022334,trivendram
008,Bharathi,Nambiayar,24,9848022333,Chennai
And we have loaded this file into Pig with the schema name student_details as
shown below.
grunt>
student_details = LOAD 'hdfs://localhost:8020/user/pcetcse/pig_data/student_details.txt'
USING PigStorage(',') as (id:int, firstname:chararray, lastname:chararray, age:int,
phone:chararray, city:chararray);
Let us now use the Filter operator to get the details of the students who belong to the city
Chennai.
grunt> filter_data = FILTER student_details BY city == 'Chennai';
⮚ Verification
Verify the relation filter_data using the DUMP operator as shown below.
grunt> Dump filter_data;
⮚ Output
It will produce the following output, displaying the contents of the relation filter_data as
follows.
(6,Archana,Mishra,23,9848022335,Chennai)
(8,Bharathi,Nambiayar,24,9848022333,Chennai)
Grouping data
grunt> group1 = group data by age;
grunt> describe group1;
group1: {group: int,data: {(age: int)}}
grunt> dump group1;
(12,{(12)})
(19,{(19)})
(24,{(24),(24)})
(25,{(25)})
(27,{(27)})
(35,{(35),(35)})
(45,{(45)})
(55,{(55)})
(65,{(65)})
The data bag is grouped by 'age'; therefore the group element contains unique values.
To see how pig transforms data
grunt> ILLUSTRATE group1;
Load Command
LOAD 'data' [USING function] [AS schema];
• data – name of the directory or file – Must be in single quotes
• USING – specifies the load function to use
– By default uses PigStorage which parses each line into fields
using a delimiter.
- Default delimiter is tab ('\t')
• AS – assign a schema to incoming data
– Assigns names to fields
– Declares types to fields
LOADING DATA:
• Create file in local file system
[cloudera@localhost ~]$ cat > a.txt
25
35
45
55
65
24
12
19
27
35
24
copy file from local file system to hdfs
[cloudera@localhost ~]$ hadoop fs -put a.txt
Pig Latin – Diagnostic Tools
• Display the structure of the Bag
grunt> DESCRIBE <bag_name>;
ex: DESCRIBE data;
• Display Execution Plan
– Produces Various reports
• Logical Plan
• MapReduce Plan
grunt> EXPLAIN <bag_name>;
ex: EXPLAIN data;
• Illustrate how Pig engine transforms the data
grunt> ILLUSTRATE <bag_name>;
ex: ILLUSTRATE data;
Filter data
grunt> filter1 = filter data by age > 30;
grunt> dump filter1;
(35)
(45)
(55)
(65)
(35)
FOREACH
FOREACH<bag> GENERATE <data>
Iterates over each element in the bag and produces a result.
grunt> records = LOAD 'std.txt' USING PigStorage(',') AS (roll:int, name:chararray);
grunt> dump records;
(501,aaa)
(502,hhh)
(507,yyy)
(204,rrr)
(510,bbb)
grunt> stdname = foreach records generate name;
grunt> dump stdname;
(aaa)
(hhh)
(yyy)
(rrr)
(bbb)
grunt> stdroll = foreach records generate roll;
grunt> dump stdroll;
(501)
(502)
(507)
(204)
(510)
JOIN:
The JOIN operator is used to combine records from two or more relations. While performing a join
operation, we declare one (or a group of) tuple(s) from each relation, as keys. When these keys match, the
two particular tuples are matched, else the records are dropped. Joins can be of the following types −
Self-join
Inner-join
Outer-join − left join, right join, and full join
Self-join
Self-join is used to join a table with itself.
Inner Join
The default join is the inner join: rows are joined where the keys match; rows that do not have matches are not included in the result.
Outer Join
Records which will not join with the 'other' record-set are still included in the result:
Left Outer: records from the first data-set are included whether they have a match or not; fields from the unmatched (second) bag are set to null.
Right Outer: the opposite of the left outer join; records from the second data-set are included no matter what, and fields from the unmatched (first) bag are set to null.
Equi-join
The inner join is used quite frequently; it is also referred to as an equi-join.
(,,,2,4)
(,,,2,7)
(,,,2,9)
(4,2,1,4,6)
(4,2,1,4,9)
(4,3,3,4,6)
(4,3,3,4,9)
(7,2,5,,)
(8,3,4,8,9)
(8,4,3,8,9)
(c,{(c),(c)})
(i,{(i),(i),(i)})
(k,{(k),(k),(k),(k)})
(l,{(l),(l)})
Output:
EXPERIMENT-10
AIM: Install and Run Hive then use Hive to create, alter, and drop databases, tables, views,
functions, and indexes
Installing Hive:
Q) Create tables emp and dept and load data from text files on hdfs.
hadoop fs -mkdir /user/chp/data
hadoop fs -put /home/chp/Desktop/hive_data/*.txt /user/chp/data
hive> create table emp(id int,name string,sal double) row format delimited fields terminated by ',';
OK Time taken: 8.331 seconds
hive> show tables;
OK
Emp
hive> create table dept(eid int,dept string) row format delimited fields terminated by '@';
OK
Time taken: 0.088 seconds
hive> load data inpath '/user/chp/data/faculty.txt' into table emp;
hive> load data inpath '/user/chp/data/dept.txt' into table dept;
Views:
Q) Create a view from emp table with the fields id and name.
1 chp
2 pnr
3 kry
Functions:
Index:
Create index:
hive>create index emp_index on table emp(name,sal) as
'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' with deferred rebuild;
hive> create index dept_index on table dept(eid) as 'bitmap' with deferred rebuild;
Drop index: