I am trying to install and run Hadoop in distributed mode (HDFS). I use Terraform to manage my GCP cluster. (1 master node, 3 worker nodes)
This is my hadoop_cluster.nix:
{ config, pkgs, lib, ... }:
let cfg = config.services.hadoopCluster;
in
with lib;
{
options.services.hadoopCluster = {
enable = mkOption {
type = types.bool;
default = false;
description = "Whether to run a Hadoop node";
};
master = mkOption {
type = types.bool;
description = "Whether it is a master node";
};
};
config = mkIf cfg.enable {
users.groups.hadoop = {};
users.users.hadoop = {
group = "hadoop";
createHome = true;
home = "/home/hadoop";
};
system.activationScripts = {
hadoopGroupRWX = {
text = "chmod -R g+rwx /home/hadoop";
deps = [];
};
};
services.hadoop = {
hdfs.namenode.enabled = cfg.master;
hdfs.datanode.enabled = !cfg.master;
yarn.nodemanager.enabled = cfg.master;
yarn.resourcemanager.enabled = cfg.master;
coreSite = {
"fs.defaultFS" = "hdfs://${master_ip}:9000";
"yarn.scheduler.capacity.root.queues" = "default";
"yarn.scheduler.capacity.root.default.capacity" = 100;
};
hdfsSite = {
"dfs.namenode.name.dir" = "/home/hadoop/data/nameNode";
"dfs.datanode.data.dir" = "/home/hadoop/data/dataNode";
"dfs.replication" = 1;
};
yarnSite = {
#"yarn.nodemanager.hostname" = "${master_ip}";
"yarn.resourcemanager.hostname" = "${master_ip}";
"yarn.nodemanager.log-dirs" = "/home/hadoop/logs/nodemanager";
"yarn.nodemanager.aux-services" = "mapreduce_shuffle";
"yarn.acl.enable" = 0;
};
mapredSite = {
"mapreduce.framework.name" = "yarn";
"yarn.app.mapreduce.am.env" = "HADOOP_MAPRED_HOME=$HADOOP_HOME";
"mapreduce.map.env" = "HADOOP_MAPRED_HOME=$HADOOP_HOME";
"mapreduce.reduce.env" = "HADOOP_MAPRED_HOME=$HADOOP_HOME";
};
package = pkgs.hadoop_3_1.overrideAttrs (oldAttrs: { installPhase = builtins.replaceStrings ["HADOOP_PREFIX"] ["HADOOP_HOME"] oldAttrs.installPhase; });
};
};
}
Inside the nix expression for the master node, master_host.nix, I have the following (snippet):
imports = [ ./hadoop_cluster.nix ];
services.hadoopCluster = {
enable = true;
master = true;
};
And in the slave_host.nix:
imports = [ ./hadoop_cluster.nix ];
services.hadoopCluster = {
enable = true;
master = false;
};
My question is what did I do wrong? What is the proper setup? I’ve also set up Hive on top of Hadoop, but every time I run a MapReduce job it runs Hadoop Locally, the worker nodes are sitting idle.