- Install Java
sudo apt install default-jdk default-jre
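- Optionally verify the installation (the reported version depends on the Ubuntu release; default-jdk typically maps to OpenJDK 11 on Ubuntu 20.04)
java -version
javac -version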
- Install OpenSSH server and client
sudo apt install openssh-server openssh-client
- Add new user
sudo adduser hadoop
- Switch to new user
sudo su - hadoop
- Generate SSH public and private keys
ssh-keygen -t rsa
- Add SSH public key to authorized keys
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
- Change permissions of authorized keys
chmod 640 ~/.ssh/authorized_keys
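- If ssh localhost below still prompts for a password, the .ssh directory itself may be too permissive; tightening it is a common fix
chmod 700 ~/.ssh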
- Verify SSH configuration
ssh localhost
- Download Hadoop
wget https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz
- Extract tar file
tar -xvzf hadoop-3.3.6.tar.gz
- Rename directory
mv hadoop-3.3.6 hadoop
- Obtain OpenJDK directory
dirname $(dirname $(readlink -f $(which java)))
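# prints the JDK directory, e.g. /usr/lib/jvm/java-11-openjdk-amd64, used as JAVA_HOME below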
- Add environment variables to ~/.bashrc
export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
export HADOOP_HOME=/home/hadoop/hadoop
- Reload bash settings
source ~/.bashrc
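- Confirm the variables are set (the paths should match the values exported above)
echo $JAVA_HOME $HADOOP_HOME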
- Confirm the setup by running the bundled example MapReduce JAR to count words (hadoop is not on the PATH yet, so invoke it via $HADOOP_HOME/bin)
mkdir input
cp $HADOOP_HOME/*.txt input
$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.6.jar wordcount input output
cat output/*
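- The output directory should also contain a _SUCCESS marker and a part-r-00000 results file, with one <word><TAB><count> pair per line
ls output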
- For a pseudo-distributed single-node cluster, set up the local environment as above
- Add environment variables to ~/.bashrc:
export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
export HADOOP_HOME=/home/hadoop/hadoop
export HADOOP_INSTALL=$HADOOP_HOME
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_HDFS_HOME=$HADOOP_HOME
export HADOOP_YARN_HOME=$HADOOP_HOME
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native"
- Reload bash settings
source ~/.bashrc
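- Confirm the hadoop command is now on the PATH (it should report version 3.3.6)
hadoop version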
- Create node metadata directories
mkdir -p /home/hadoop/hadoop/hdfs/{namenode,datanode}
- Change to Hadoop configuration directory
cd hadoop/etc/hadoop
- Add Java environment variable to the hadoop-env.sh file
export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
- Edit configuration in core-site.xml
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
</configuration>
- Edit configuration in hdfs-site.xml
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:///home/hadoop/hadoop/hdfs/namenode</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:///home/hadoop/hadoop/hdfs/datanode</value>
</property>
</configuration>
- Edit configuration in mapred-site.xml
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.application.classpath</name>
<value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*</value>
</property>
</configuration>
- Edit configuration in yarn-site.xml
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_HOME,PATH,LANG,TZ,HADOOP_MAPRED_HOME</value>
</property>
</configuration>
- Switch to Hadoop user account
sudo su - hadoop
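- Optionally sanity-check that the settings resolve as expected (hdfs getconf prints values from the merged configuration)
hdfs getconf -confKey fs.defaultFS   # expect hdfs://localhost:9000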
- Remove any previous node data and format the HDFS NameNode
cd ~
rm -rf ~/hadoop/hdfs/namenode/*
rm -rf ~/hadoop/hdfs/datanode/*
hdfs namenode -format
- Start the distributed file system (HDFS) daemons
start-dfs.sh
- Start YARN (resource manager and node manager)
start-yarn.sh
- Check Java virtual machine process status
jps
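# With HDFS and YARN both running, the list should include NameNode, DataNode,
# SecondaryNameNode, ResourceManager, and NodeManager (PIDs will differ)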
- Access browser interfaces
- Name node information: http://localhost:9870
- Data node information: http://localhost:9864
- All applications: http://localhost:8088
- Create directory, copy, and view files on HDFS
hdfs dfs -mkdir /input
hdfs dfs -ls /
hdfs dfs -put ~/input/* /input
hdfs dfs -cat /input/*
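- Confirm the uploaded files are listed with the expected sizes
hdfs dfs -ls /input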
- Browse directory on interface: http://localhost:9870/explorer.html
- Remove a directory, recursively deleting its child files and subdirectories
hdfs dfs -rm -r /input
- Stop the HDFS daemons
stop-dfs.sh
- Stop resource manager
stop-yarn.sh
- Put files on HDFS for processing
hdfs dfs -mkdir /input
hdfs dfs -put ~/hadoop/etc/hadoop/*.xml /input
- Use the example MapReduce Java archive file to find strings matching the regular expression 'dfs[a-z.]+'
hadoop jar ~/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.6.jar grep /input /output 'dfs[a-z.]+'
hdfs dfs -cat /output/*
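# Each output line is <count><TAB><matched string>; the exact matches depend on
# the *.xml files copied into /input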
- Compile WordCount2.java against the Hadoop classpath and create a Java archive (Java 11 has no tools.jar, so compile with javac directly)
javac -classpath "$(hadoop classpath)" WordCount2.java
jar cf wc2.jar WordCount2*.class
- Run word count MapReduce job on example files
hadoop fs -mkdir /input/example
hadoop fs -put ~/input/example/* /input/example
hadoop jar wc2.jar WordCount2 /input/example /wordcount/output
hadoop fs -cat /wordcount/output/*
- Re-run word count job with case sensitivity and pattern file
hadoop fs -put ~/wordcount/patterns.txt /wordcount
hadoop jar wc2.jar WordCount2 -Dwordcount.case.sensitive=true /input/example /wordcount/output2 -skip /wordcount/patterns.txt
hadoop fs -cat /wordcount/output2/*
- If the NameNode or DataNode is missing from the jps output, stop HDFS, clear the node data, and reformat (this erases all data stored in HDFS)
stop-dfs.sh
rm -rf ~/hadoop/hdfs/namenode/*
rm -rf ~/hadoop/hdfs/datanode/*
hdfs namenode -format
start-dfs.sh
jps