Hadoop Configuration File Templates

You can use the following configuration file templates. Replace ${masterhost} with the host name of the node that runs the NameNode and JobTracker; references to ${hadoop.tmp.dir} are expanded by Hadoop itself when the configuration is loaded.

core-site.xml

<configuration>
<property>
  <name>fs.default.name</name>
  <value>hdfs://${masterhost}:9000</value>
  <description>The name of the default file system.  This is a URI whose
  scheme and authority determine the FileSystem implementation.  The
  URI's scheme determines the config property (fs.SCHEME.impl) naming
  the FileSystem implementation class.  The URI's authority is used to
  determine the host, port, etc. for a filesystem.</description>
</property>

<property>
  <name>fs.checkpoint.dir</name>
  <value>${hadoop.tmp.dir}/dfs/namesecondary</value>
  <description>Determines where on the local filesystem the DFS secondary
      name node should store the temporary images to merge.
      If this is a comma-delimited list of directories then the image is
      replicated in all of the directories for redundancy.
  </description>
</property>

<property>
  <name>hadoop.tmp.dir</name>
  <value>/tmp/hadoop</value>
  <description>A base for other temporary directories.</description>
</property>

<property>
  <name>io.file.buffer.size</name>
  <value>65536</value>
  <description>The size of the buffer used in sequence files.
  The size of this buffer should probably be a multiple of the hardware
  page size (4096 on Intel x86), and it determines how much data is
  buffered during read and write operations.</description>
</property>
</configuration>
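
To see how Hadoop resolves these values, including the expansion of ${hadoop.tmp.dir} inside fs.checkpoint.dir, you can load the template through the standard org.apache.hadoop.conf.Configuration API. The following is a minimal sketch, assuming core-site.xml is in the working directory; the class name CoreSiteCheck is a placeholder:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class CoreSiteCheck {
    public static void main(String[] args) {
        // Configuration reads core-site.xml from the classpath by default;
        // addResource loads the template explicitly.
        Configuration conf = new Configuration();
        conf.addResource(new Path("core-site.xml"));

        // Variable references are expanded at lookup time, so with the
        // template values above this prints /tmp/hadoop/dfs/namesecondary.
        System.out.println("fs.default.name   = " + conf.get("fs.default.name"));
        System.out.println("fs.checkpoint.dir = " + conf.get("fs.checkpoint.dir"));
        System.out.println("io.file.buffer.size = "
                + conf.getInt("io.file.buffer.size", 4096));
    }
}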

mapred-site.xml

<configuration>
<property>
  <name>mapred.job.tracker</name>
  <value>${masterhost}:9001</value>
  <description>The host and port at which the MapReduce job tracker runs.
  If "local", then jobs are run in-process as a single map
  and reduce task.
  </description>
</property>

<property>
  <name>mapred.tasktracker.map.tasks.maximum</name>
  <value>2</value>
  <description>The maximum number of map tasks that a task tracker
  will run simultaneously.
  </description>
</property>

<property>
  <name>mapred.tasktracker.reduce.tasks.maximum</name>
  <value>2</value>
  <description>The maximum number of reduce tasks that a task tracker
  will run simultaneously.
  </description>
</property>

<property>
  <name>mapred.child.java.opts</name>
  <value>-Xmx500m</value>
  <description>Java opts for the task tracker child processes.
  The following symbol, if present, will be interpolated: @taskid@ is replaced
  by the current TaskID. Any other occurrences of '@' will go unchanged.
  For example, to enable verbose gc logging to a file named for the taskid in
  /tmp and to set the heap maximum to be a gigabyte, pass a 'value' of:
        -Xmx1024m -verbose:gc -Xloggc:/tmp/@taskid@.gc
  
  The configuration variable mapred.child.ulimit can be used to control the
  maximum virtual memory of the child processes. 
  </description>
</property>

<property>
  <name>mapred.local.dir</name>
  <value>${hadoop.tmp.dir}/mapred/local</value>
  <description>The local directory where MapReduce stores intermediate
  data files.  May be a comma-separated list of
  directories on different devices in order to spread disk I/O.
  Directories that do not exist are ignored.
  </description>
</property>

<property>
  <name>mapred.system.dir</name>
  <value>/system/mapred</value>
  <description>The shared directory where MapReduce stores control files.
  </description>
</property>

<property>
  <name>mapred.tasktracker.dns.interface</name>
  <value>default</value>
  <description>The name of the network interface from which a task
  tracker should report its IP address (e.g., eth0).
  </description>
</property>
</configuration>
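
The tasktracker limits above are cluster-wide, but per-job settings such as the child JVM options can also be set in code before a job is submitted. The following is a minimal sketch using the classic org.apache.hadoop.mapred.JobConf API; the GC-logging value is the example given in the mapred.child.java.opts description above, and the class name ChildOptsExample is a placeholder:

import org.apache.hadoop.mapred.JobConf;

public class ChildOptsExample {
    public static void main(String[] args) {
        JobConf job = new JobConf();

        // Override the template's -Xmx500m for this job only, and log
        // verbose GC output to a file named after the task; @taskid@ is
        // interpolated by the framework at task launch.
        job.set("mapred.child.java.opts",
                "-Xmx1024m -verbose:gc -Xloggc:/tmp/@taskid@.gc");

        // Falls back to "local" (in-process execution) when no job
        // tracker address is configured.
        System.out.println("job tracker: "
                + job.get("mapred.job.tracker", "local"));
    }
}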

hdfs-site.xml

<configuration>
<property>
  <name>dfs.permissions</name>
  <value>false</value>
  <description>
    If "true", enable permission checking in HDFS.
    If "false", permission checking is turned off,
    but all other behavior is unchanged.
    Switching from one parameter value to the other does not change the mode,
    owner, or group of files or directories.
  </description>
</property>

<property>
  <name>dfs.data.dir</name>
  <value>${hadoop.tmp.dir}/dfs/data</value>
  <description>Determines where on the local filesystem a DFS data node
  should store its blocks.  If this is a comma-delimited
  list of directories, then data will be stored in all named
  directories, typically on different devices.
  Directories that do not exist are ignored.
  </description>
</property>

<property>
  <name>dfs.replication</name>
  <value>3</value>
  <description>Default block replication.
  The actual number of replicas can be specified when the file is created.
  The default value is used if replication is not specified at creation time.
  </description>
</property>

<property>
  <name>dfs.name.dir</name>
  <value>${hadoop.tmp.dir}/dfs/name</value>
  <description>Determines where on the local filesystem the DFS name node
      should store the name table (fsimage).  If this is a comma-delimited list
      of directories then the name table is replicated in all of the
      directories, for redundancy. </description>
</property>

<property>
  <name>dfs.datanode.du.reserved</name>
  <value>10737418240</value>
  <description>Reserved space in bytes per volume. Always leave this much space
  free for non-DFS use. (The value above, 10737418240 bytes, reserves 10 GB per
  volume.)
  </description>
</property>

<property>
  <name>dfs.datanode.max.xcievers</name>
  <value>4096</value>
  <description>An upper bound on the number of transceiver threads a data node
  uses to serve data in and out. (The misspelled property name is Hadoop's own
  spelling and must be used as-is.)</description>
</property>

<property>
  <name>dfs.datanode.dns.interface</name>
  <value>default</value>
  <description>The name of the network interface from which a data node
  should report its IP address (e.g., eth0).
  </description>
</property>
</configuration>
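
Because dfs.replication is only a default, a client can request a different replication factor per file through the FileSystem API. The following is a minimal sketch, assuming a running cluster configured from the templates above; the path /user/test/sample.txt and the class name ReplicationCheck are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReplicationCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Placeholder path: substitute a file that exists in your HDFS.
        Path file = new Path("/user/test/sample.txt");

        // Report the replication factor recorded for the file ...
        short current = fs.getFileStatus(file).getReplication();
        System.out.println("replication of " + file + " = " + current);

        // ... and request a different factor for this file only; the
        // namenode schedules re-replication in the background.
        fs.setReplication(file, (short) 2);
    }
}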