Job Submission Flow and Input Split Source Code in Detail

1. Entering the Job submission method

public boolean waitForCompletion(boolean verbose

                                   ) throws IOException, InterruptedException,

                                            ClassNotFoundException {

// Check the Job's state: if it is RUNNING, the Job is already executing and will not be submitted again.

    if (state == JobState.DEFINE) {

      submit();

}

// If verbose, monitor the job and print its progress information.

    if (verbose) {

      monitorAndPrintJob();

    } else {

      // get the completion poll interval from the client.

      int completionPollIntervalMillis =

        Job.getCompletionPollInterval(cluster.getConf());

      while (!isComplete()) {

        try {

          Thread.sleep(completionPollIntervalMillis);

        } catch (InterruptedException ie) {

        }

      }

    }

    return isSuccessful();

  }
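For context, the call above is usually the last line of a driver class. Below is a minimal driver sketch (not taken from the walkthrough): it uses the identity Mapper and Reducer, so the output key/value types match TextInputFormat's (LongWritable offset, Text line) pairs, and it ends in exactly the waitForCompletion() call examined here.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class IdentityJobDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "identity job");
    job.setJarByClass(IdentityJobDriver.class);
    job.setMapperClass(Mapper.class);     // identity Mapper: passes (offset, line) through
    job.setReducerClass(Reducer.class);   // identity Reducer
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    // Entry point into the source walked through above: submit, then poll until done.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}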

 

 

 

1.1 Submitting the Job to the Cluster

public void submit()

         throws IOException, InterruptedException, ClassNotFoundException {

    ensureState(JobState.DEFINE);

setUseNewAPI();

// Create the Cluster object. It holds two key components: ① the file system, which reads the input data into the program and writes out/stores the results; ② the client that runs the Job: LocalJobRunner when the Job runs locally, YarnRunner when the Job runs on YARN.

    connect();

    final JobSubmitter submitter =

        getJobSubmitter(cluster.getFileSystem(), cluster.getClient());

    status = ugi.doAs(new PrivilegedExceptionAction<JobStatus>() {

      public JobStatus run() throws IOException, InterruptedException,

      ClassNotFoundException {

        return submitter.submitJobInternal(Job.this, cluster);

      }

    });

    state = JobState.RUNNING;

    LOG.info("The url to track the job: " + getTrackingURL());

   }

 

 

1.2 Creating the Cluster

private synchronized void connect()

          throws IOException, InterruptedException, ClassNotFoundException {

  // Based on the user's Configuration, create the corresponding Cluster, which is responsible for running the Job.

    if (cluster == null) {

      cluster =

        ugi.doAs(new PrivilegedExceptionAction<Cluster>() {

                   public Cluster run()

                          throws IOException, InterruptedException,

                                 ClassNotFoundException {

                     return new Cluster(getConfiguration());

                   }

                 });

    }

  }
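Which client the new Cluster ends up with is driven by the mapreduce.framework.name property that the Configuration carries into connect(). A small sketch of that switch (the values shown are the standard ones; in a real cluster this is normally set in mapred-site.xml rather than in code):

import org.apache.hadoop.conf.Configuration;

public class FrameworkSelection {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Default is "local": connect() ends up with LocalJobRunner as cluster.getClient().
    System.out.println(conf.get("mapreduce.framework.name", "local"));

    // Switching to YARN makes connect() pick the YARN runner instead.
    conf.set("mapreduce.framework.name", "yarn");
  }
}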

 

 

1.3 Submitting the Job with the JobSubmitter

JobStatus submitJobInternal(Job job, Cluster cluster)

  throws ClassNotFoundException, InterruptedException, IOException {

 

// Validate the output specification: the output directory must be set and must not already exist.

    checkSpecs(job);

 

    Configuration conf = job.getConfiguration();

    addMRFrameworkToDistributedCache(conf);

    // Get the path of the staging area for this Job's submission files.

    Path jobStagingArea = JobSubmissionFiles.getStagingDir(cluster, conf);

    //configure the command line options correctly on the submitting dfs

    InetAddress ip = InetAddress.getLocalHost();

    if (ip != null) {

      submitHostAddress = ip.getHostAddress();

      submitHostName = ip.getHostName();

      conf.set(MRJobConfig.JOB_SUBMITHOST,submitHostName);

      conf.set(MRJobConfig.JOB_SUBMITHOSTADDR,submitHostAddress);

    }

    JobID jobId = submitClient.getNewJobID();

job.setJobID(jobId);

// For a local submission, the Job's staging directory is under /tmp on the drive that holds the Eclipse workspace.
// For a submission to YARN, the staging directory is under /tmp on HDFS.

    Path submitJobDir = new Path(jobStagingArea, jobId.toString());

    JobStatus status = null;

    try {

      conf.set(MRJobConfig.USER_NAME,

          UserGroupInformation.getCurrentUser().getShortUserName());

      conf.set("hadoop.http.filter.initializers",

          "org.apache.hadoop.yarn.server.webproxy.amfilter.AmFilterInitializer");

      conf.set(MRJobConfig.MAPREDUCE_JOB_DIR, submitJobDir.toString());

      LOG.debug("Configuring job " + jobId + " with " + submitJobDir

          + " as the submit dir");

      // get delegation token for the dir

      TokenCache.obtainTokensForNamenodes(job.getCredentials(),

          new Path[] { submitJobDir }, conf);

     

      populateTokenCache(conf, job.getCredentials());

 

      // generate a secret to authenticate shuffle transfers

      if (TokenCache.getShuffleSecretKey(job.getCredentials()) == null) {

        KeyGenerator keyGen;

        try {

          keyGen = KeyGenerator.getInstance(SHUFFLE_KEYGEN_ALGORITHM);

          keyGen.init(SHUFFLE_KEY_LENGTH);

        } catch (NoSuchAlgorithmException e) {

          throw new IOException("Error generating shuffle secret key", e);

        }

        SecretKey shuffleKey = keyGen.generateKey();

        TokenCache.setShuffleSecretKey(shuffleKey.getEncoded(),

            job.getCredentials());

      }

      if (CryptoUtils.isEncryptedSpillEnabled(conf)) {

        conf.setInt(MRJobConfig.MR_AM_MAX_ATTEMPTS, 1);

        LOG.warn("Max job attempts set to 1 since encrypted intermediate" +

                "data spill is enabled");

      }

 

      copyAndConfigureFiles(job, submitJobDir);

 

      Path submitJobFile = JobSubmissionFiles.getJobConfPath(submitJobDir);

     

      // Create the splits for the job

      LOG.debug("Creating splits at " + jtFs.makeQualified(submitJobDir));

      // Split computation: this generates the split file and the split meta-info file, which describe the splits and how to read them.
      // The split file records how many splits were cut across all files in the input directory; each split is a FileSplit object.
      // The split meta-info describes all of the splits and records, for each one, which node its data should be read from.

      int maps = writeSplits(job, submitJobDir);

      // Set mapreduce.job.maps to the number of splits.

      conf.setInt(MRJobConfig.NUM_MAPS, maps);

      LOG.info("number of splits:" + maps);

 

      // write "queue admins of the queue to which job is being submitted"

      // to job file.

      String queue = conf.get(MRJobConfig.QUEUE_NAME,

          JobConf.DEFAULT_QUEUE_NAME);

      AccessControlList acl = submitClient.getQueueAdmins(queue);

      conf.set(toFullPropertyName(queue,

          QueueACL.ADMINISTER_JOBS.getAclName()), acl.getAclString());

 

      // removing jobtoken referrals before copying the jobconf to HDFS

      // as the tasks don't need this setting, actually they may break

      // because of it if present as the referral will point to a

      // different job.

      TokenCache.cleanUpTokenReferral(conf);

 

      if (conf.getBoolean(

          MRJobConfig.JOB_TOKEN_TRACKING_IDS_ENABLED,

          MRJobConfig.DEFAULT_JOB_TOKEN_TRACKING_IDS_ENABLED)) {

        // Add HDFS tracking ids

        ArrayList<String> trackingIds = new ArrayList<String>();

        for (Token<? extends TokenIdentifier> t :

            job.getCredentials().getAllTokens()) {

          trackingIds.add(t.decodeIdentifier().getTrackingId());

        }

        conf.setStrings(MRJobConfig.JOB_TOKEN_TRACKING_IDS,

            trackingIds.toArray(new String[trackingIds.size()]));

      }

 

      // Set reservation info if it exists

      ReservationId reservationId = job.getReservationId();

      if (reservationId != null) {

        conf.set(MRJobConfig.RESERVATION_ID, reservationId.toString());

      }

 

      // Write all of the Job's configuration into job.xml.

      writeConf(conf, submitJobFile);

     

      //

      // Now, actually submit the job (using the submit name)

      //

      printTokens(jobId, job.getCredentials());

      // Formally submit the Job.

      status = submitClient.submitJob(

          jobId, submitJobDir.toString(), job.getCredentials());

      if (status != null) {

        return status;

      } else {

        throw new IOException("Could not launch job");

      }

    } finally {

      if (status == null) {

        LOG.info("Cleaning up the staging area " + submitJobDir);

        if (jtFs != null && submitJobDir != null)

          jtFs.delete(submitJobDir, true);

 

      }

    }

  }
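For FileInputFormat-based jobs, the map count returned by writeSplits() follows from a per-file split size. The sketch below reproduces the computeSplitSize() formula used by FileInputFormat and walks a hypothetical 300 MB file with a 128 MB block size through the 1.1x slop rule; the concrete sizes are example values, not anything from the job above.

public class SplitSizeExample {
  // Same formula FileInputFormat.computeSplitSize() uses.
  static long computeSplitSize(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
  }

  public static void main(String[] args) {
    long blockSize = 128L * 1024 * 1024;  // 128 MB HDFS block (example value)
    long minSize = 1L;                    // mapreduce.input.fileinputformat.split.minsize
    long maxSize = Long.MAX_VALUE;        // mapreduce.input.fileinputformat.split.maxsize
    long splitSize = computeSplitSize(blockSize, minSize, maxSize);

    long remaining = 300L * 1024 * 1024;  // a hypothetical 300 MB input file
    int splits = 0;
    // Keep cutting splits while the remainder is more than 1.1x the split size,
    // then put whatever is left into one final split: 300 MB / 128 MB -> 3 splits -> 3 MapTasks.
    while (((double) remaining) / splitSize > 1.1) {
      splits++;
      remaining -= splitSize;
    }
    if (remaining > 0) {
      splits++;
    }
    System.out.println("splitSize=" + splitSize + ", number of splits=" + splits);
  }
}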

 

1.4 Submitting the Job

public org.apache.hadoop.mapreduce.JobStatus submitJob(

      org.apache.hadoop.mapreduce.JobID jobid, String jobSubmitDir,

      Credentials credentials) throws IOException {

  // Using the submit files prepared earlier, rebuild the Job (as a LocalJobRunner$Job).

    Job job = new Job(JobID.downgrade(jobid), jobSubmitDir);

    job.job.setCredentials(credentials);

    return job.status;

 

  }

 

1.5 Creating a Job object that LocalJobRunner can run

public Job(JobID jobid, String jobSubmitDir) throws IOException {

      ……

    // Re-assign all of the settings generated earlier for running the Job to this LocalJobRunner$Job.
     // Start a separate thread to run the Job.

      this.start();

    }

 

1.6 The Job's run()

@Override

    public void run() {

      JobID jobId = profile.getJobID();

   // JobContext represents the context the Job runs in; all of the Job's configuration can be obtained from it.

      JobContext jContext = new JobContextImpl(job, jobId);

     

      org.apache.hadoop.mapreduce.OutputCommitter outputCommitter = null;

      try {

        outputCommitter = createOutputCommitter(conf.getUseNewMapper(), jobId, conf);

      } catch (Exception e) {

        LOG.info("Failed to createOutputCommitter", e);

        return;

      }

     

      try {

       // From the split meta-info, build a TaskSplitMetaInfo array; its length equals the number of splits.

        TaskSplitMetaInfo[] taskSplitMetaInfos =

          SplitMetaInfoReader.readSplitMetaInfo(jobId, localFs, conf, systemJobDir);

 

        int numReduceTasks = job.getNumReduceTasks();

        outputCommitter.setupJob(jContext);

        status.setSetupProgress(1.0f);

      // Holds the output file locations of all MapTasks.

        Map<TaskAttemptID, MapOutputFile> mapOutputFiles =

            Collections.synchronizedMap(new HashMap<TaskAttemptID, MapOutputFile>());

        // Build the list of MapTask runnables to execute.

        List<RunnableWithThrowable> mapRunnables = getMapTaskRunnables(

            taskSplitMetaInfos, jobId, mapOutputFiles);

             

        initCounters(mapRunnables.size(), numReduceTasks);

      // Create a thread pool.

        ExecutorService mapService = createMapExecutor();

      // Run all of the MapTasks; see MapTaskRunnable.run().

        runTasks(mapRunnables, mapService, "map");

 

        try {

          if (numReduceTasks > 0) {

            List<RunnableWithThrowable> reduceRunnables = getReduceTaskRunnables(

                jobId, mapOutputFiles);

            ExecutorService reduceService = createReduceExecutor();

            runTasks(reduceRunnables, reduceService, "reduce");

          }

        } finally {

          for (MapOutputFile output : mapOutputFiles.values()) {

            output.removeAll();

          }

        }

        // delete the temporary directory in output directory

        outputCommitter.commitJob(jContext);

        status.setCleanupProgress(1.0f);

 

        if (killed) {

          this.status.setRunState(JobStatus.KILLED);

        } else {

          this.status.setRunState(JobStatus.SUCCEEDED);

        }

 

        JobEndNotifier.localRunnerNotification(job, status);

      } catch (Throwable t) {

        try {

          outputCommitter.abortJob(jContext,

            org.apache.hadoop.mapreduce.JobStatus.State.FAILED);

        } catch (IOException ioe) {

          LOG.info("Error cleaning up job:" + id);

        }

        status.setCleanupProgress(1.0f);

        if (killed) {

          this.status.setRunState(JobStatus.KILLED);

        } else {

          this.status.setRunState(JobStatus.FAILED);

        }

        LOG.warn(id, t);

 

        JobEndNotifier.localRunnerNotification(job, status);

 

      } finally {

        try {

          fs.delete(systemJobFile.getParent(), true);  // delete submit dir

          localFs.delete(localJobFile, true);              // delete local copy

          // Cleanup distributed cache

          localDistributedCacheManager.close();

        } catch (IOException e) {

          LOG.warn("Error cleaning up "+id+": "+e);

        }

      }

    }
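Note that the reduce runnables above are only built when numReduceTasks > 0. A map-only job can skip the whole reduce branch from the driver; a minimal sketch (assuming the usual driver setup shown earlier):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class MapOnlyJobSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "map-only example");
    // With zero reducers the numReduceTasks > 0 branch above is skipped entirely:
    // no reduce runnables are created, and each MapTask's output goes straight to the
    // output directory instead of to intermediate map output files destined for a shuffle.
    job.setNumReduceTasks(0);
  }
}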

 

2. Entering the Map phase

2.1 Entering MapTaskRunnable's run()

public void run() {

        try {

        // Generate the TaskAttemptID for the current task.

          TaskAttemptID mapId = new TaskAttemptID(new TaskID(

              jobId, TaskType.MAP, taskId), 0);

          LOG.info("Starting task: " + mapId);

          mapIds.add(mapId);

          MapTask map = new MapTask(systemJobFile.toString(), mapId, taskId,

            info.getSplitIndex(), 1);

          map.setUser(UserGroupInformation.getCurrentUser().

              getShortUserName());

          setupChildMapredLocalDirs(map, localConf);

          // Create the object that manages this MapTask's output files.

          MapOutputFile mapOutput = new MROutputFiles();

          mapOutput.setConf(localConf);

          mapOutputFiles.put(mapId, mapOutput);

 

          map.setJobFile(localJobFile.toString());

          localConf.setUser(map.getUser());

          map.localizeConfiguration(localConf);

          map.setConf(localConf);

          try {

            map_tasks.getAndIncrement();

            myMetrics.launchMap(mapId);

           // Enter MapTask.run().

            map.run(localConf, Job.this);

            myMetrics.completeMap(mapId);

          } finally {

            map_tasks.getAndDecrement();

          }

 

          LOG.info("Finishing task: " + mapId);

        } catch (Throwable e) {

          this.storedException = e;

        }

      }

    }

 

2.2 MapTask's run()

@Override

  public void run(final JobConf job, final TaskUmbilicalProtocol umbilical)

    throws IOException, ClassNotFoundException, InterruptedException {

    this.umbilical = umbilical;

     // Decide whether a reduce phase is needed.
   // The Map side can itself be split into two phases:
   // map:  call the Mapper's map() method to process the input key-value pairs
   // sort: when map() finishes with a pair, context.write() stores the key-value pair to a file;
   //       before the pairs reach the file, all key-value pairs are sorted, so a sort phase runs

    if (isMapTask()) {

      // If there are no reducers then there won't be any sort. Hence the map

      // phase will govern the entire attempt's progress.

      if (conf.getNumReduceTasks() == 0) {

        mapPhase = getProgress().addPhase("map", 1.0f);

      } else {

        // If there are reducers then the entire attempt's progress will be

        // split between the map phase (67%) and the sort phase (33%).

        mapPhase = getProgress().addPhase("map", 0.667f);

        sortPhase  = getProgress().addPhase("sort", 0.333f);

      }

    }

    TaskReporter reporter = startReporter(umbilical);

 

    boolean useNewApi = job.getUseNewMapper();

    initialize(job, getJobID(), reporter, useNewApi);

 

    // check if it is a cleanupJobTask

    if (jobCleanup) {

      runJobCleanupTask(umbilical, reporter);

      return;

    }

    if (jobSetup) {

      runJobSetupTask(umbilical, reporter);

      return;

    }

    if (taskCleanup) {

      runTaskCleanupTask(umbilical, reporter);

      return;

    }

 

    if (useNewApi) {

      runNewMapper(job, splitMetaInfo, umbilical, reporter);

    } else {

      runOldMapper(job, splitMetaInfo, umbilical, reporter);

    }

    done(umbilical, reporter);

  }
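The useNewApi branch is, roughly speaking, decided by which Mapper API the job was configured with: setUseNewAPI() in Job.submit() flips the flag when the job is set up with the org.apache.hadoop.mapreduce classes. A small example of a Mapper written against that new API, which is the case that leads into runNewMapper() below (the class itself is illustrative, not part of the source above):

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// A Mapper against the new (org.apache.hadoop.mapreduce) API. Configuring a job with a
// class like this is what makes getUseNewMapper() return true, so run() calls runNewMapper().
public class LineLengthMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
  private final Text outKey = new Text();
  private final IntWritable outValue = new IntWritable();

  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    // context.write() hands the pair to the collector created in runNewMapper();
    // that is where the sort phase described above happens when reducers exist.
    outKey.set(value.toString());
    outValue.set(value.getLength());
    context.write(outKey, outValue);
  }
}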

 

2.3 Running the Mapper

@SuppressWarnings("unchecked")

  private <INKEY,INVALUE,OUTKEY,OUTVALUE>

  void runNewMapper(final JobConf job,

                    final TaskSplitIndex splitIndex,

                    final TaskUmbilicalProtocol umbilical,

                    TaskReporter reporter

                    ) throws IOException, ClassNotFoundException,

                             InterruptedException {

// The MapTask's context object.

    org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =

      new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job,

                                                                  getTaskID(),

                                                                  reporter);

    // Instantiate the Mapper object via reflection.

    org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper =

      (org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>)

        ReflectionUtils.newInstance(taskContext.getMapperClass(), job);

    // Instantiate the InputFormat via reflection.

    org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat =

      (org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>)

        ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);

// Rebuild the split this MapTask is responsible for.

    org.apache.hadoop.mapreduce.InputSplit split = null;

    split = getSplitDetails(new Path(splitIndex.getSplitLocation()),

        splitIndex.getStartOffset());

    LOG.info("Processing split: " + split);

// Responsible for initializing the RecordReader.
// Inside the NewTrackingRecordReader constructor, the real RecordReader that actually reads the records is created and assigned.

    org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input =

      new NewTrackingRecordReader<INKEY,INVALUE>

        (split, inputFormat, reporter, taskContext);

   

job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());

// output is responsible for writing out the key-value pairs the MapTask produces.

    org.apache.hadoop.mapreduce.RecordWriter output = null;

   

    // get an output object

    if (job.getNumReduceTasks() == 0) {

      output =

       // If there is no reduce phase, use a record writer that outputs directly