owen-jia / lts-job Goto Github PK
View Code? Open in Web Editor NEWJAVA开发的分布式任务调度系统(light-task-scheduler),简称:LTS-JOB
License: Apache License 2.0
JAVA开发的分布式任务调度系统(light-task-scheduler),简称:LTS-JOB
License: Apache License 2.0
在job-tracker上需要支持mongodb,mysql并行
出现大量 error 日志,同一时期还出现大量 warn 日志;两者出自代码的同一块区域,存在必然关系。
这些异常都是出现在实时任务上,定时任务不会存在这样问题
code:
` /**
 * Collects up to size jobs for one task tracker. Each job obtained from the
 * pre-loader is added to the executing queue and then removed from the
 * executable queue before it is put into the returned list.
 * Stops early as soon as the pre-loader has no job to offer.
 */
private List fetchJob(String taskTrackerNodeGroup, String taskTrackerIdentity, int size) {
List jobPos = new ArrayList(size);
for (int i = 0; i < size; i++) {
// Take one runnable job via the pre-loader (the original comment says "from mongo").
final JobPo jobPo = appContext.getPreLoader().take(taskTrackerNodeGroup, taskTrackerIdentity);
if (jobPo == null) {
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("Job push failed: no job! nodeGroup=" + taskTrackerNodeGroup + ", identity=" + taskTrackerIdentity);
}
// Nothing left to hand out: return what has been collected so far.
break;
}
// IMPORTANT: switch queues first here - the job is added to the executing queue
// before it is removed from the executable queue.
try {
appContext.getExecutingJobQueue().add(jobPo);
} catch (DupEntryException e) {
// The executing queue rejected the job as a duplicate: log it, call resume() on the
// executable queue for this job (presumably undoes the lock taken by take() - confirm),
// and move on. This iteration still counts against size.
LOGGER.warn("ExecutingJobQueue already exist:" + JSON.toJSONString(jobPo));
appContext.getExecutableJobQueue().resume(jobPo);
continue;
}
appContext.getExecutableJobQueue().remove(jobPo.getTaskTrackerNodeGroup(), jobPo.getJobId());
jobPos.add(jobPo);
}
return jobPos;
}`
public JobPo take(String taskTrackerNodeGroup, String taskTrackerIdentity) { while (true) { JobPo jobPo = get(taskTrackerNodeGroup); if (jobPo == null) { DotLogUtils.dot("Empty JobQueue, taskTrackerNodeGroup:{}, taskTrackerIdentity:{}", taskTrackerNodeGroup, taskTrackerIdentity); return null; } // update jobPo PeriodUtils.start(); try { if (lockJob(taskTrackerNodeGroup, jobPo.getJobId(), taskTrackerIdentity, jobPo.getTriggerTime(), jobPo.getGmtModified())) { jobPo.setTaskTrackerIdentity(taskTrackerIdentity); jobPo.setIsRunning(true); jobPo.setGmtModified(SystemClock.now()); return jobPo; } } finally { PeriodUtils.end("AbstractPreLoader.lockJob taskId:{}", jobPo.getTaskId()); } } }
log:
[lts] 2020-03-10 05:16:14,093 ERROR [AbstractServerNode-thread-23] com.github.ltsopensource.core.logger.slf4j.Slf4jLogger.error(76) | [LTS] Error when lock job:Update SQL Error: UPDATE
lts_wjq_rentTaskTracker SET
is_running= ? ,
task_tracker_identity= ? ,
gmt_modified= ? WHERE job_id = ? AND is_running = ? AND trigger_time = ? AND gmt_modified = ?, lts version: 1.7.0, current host: 10.10.63.225 com.github.ltsopensource.store.jdbc.exception.JdbcException: Update SQL Error: UPDATE
lts_wjq_rentTaskTracker SET
is_running= ? ,
task_tracker_identity= ? ,
gmt_modified= ? WHERE job_id = ? AND is_running = ? AND trigger_time = ? AND gmt_modified = ? at com.github.ltsopensource.store.jdbc.builder.UpdateSql.doUpdate(UpdateSql.java:182) at com.github.ltsopensource.queue.mysql.MysqlPreLoader.lockJob(MysqlPreLoader.java:60) at com.github.ltsopensource.queue.AbstractPreLoader.take(AbstractPreLoader.java:150) at com.github.ltsopensource.jobtracker.sender.JobSender.fetchJob(JobSender.java:61) at com.github.ltsopensource.jobtracker.sender.JobSender.send(JobSender.java:33) at com.github.ltsopensource.jobtracker.complete.biz.PushNewJobBiz.getNewJob(PushNewJobBiz.java:47) at com.github.ltsopensource.jobtracker.complete.biz.PushNewJobBiz.doBiz(PushNewJobBiz.java:33) at com.github.ltsopensource.jobtracker.processor.JobCompletedProcessor.processRequest(JobCompletedProcessor.java:42) at com.github.ltsopensource.jobtracker.processor.RemotingDispatcher.doBiz(RemotingDispatcher.java:80) at com.github.ltsopensource.jobtracker.processor.RemotingDispatcher.processRequest(RemotingDispatcher.java:57) at com.github.ltsopensource.remoting.AbstractRemoting$1.run(AbstractRemoting.java:73) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) Caused by: java.sql.SQLException: Deadlock found when trying to get lock; try restarting transaction Query: UPDATE
lts_wjq_rentTaskTracker SET
is_running = ? ,
task_tracker_identity = ? ,
gmt_modified = ? WHERE job_id = ? AND is_running = ? AND trigger_time = ? AND gmt_modified = ? Parameters: [true, TT_10.10.103.12_3620_13-32-53.502_2, 1583788574088, 387461ABB8DF40DBA4AEFE9BBC99868F, false, 1583788573998, 1583788573998] at com.github.ltsopensource.store.jdbc.dbutils.DbRunner.rethrow(DbRunner.java:232) at com.github.ltsopensource.store.jdbc.dbutils.DbRunner.update(DbRunner.java:84) at com.github.ltsopensource.store.jdbc.dbutils.DbRunner.update(DbRunner.java:60) at com.github.ltsopensource.store.jdbc.SqlTemplateImpl.update(SqlTemplateImpl.java:87) at com.github.ltsopensource.store.jdbc.SqlTemplateImpl$2.run(SqlTemplateImpl.java:76) at com.github.ltsopensource.store.jdbc.SqlTemplateImpl$2.run(SqlTemplateImpl.java:73) at com.github.ltsopensource.store.jdbc.SqlTemplateImpl.execute(SqlTemplateImpl.java:30) at com.github.ltsopensource.store.jdbc.SqlTemplateImpl.update(SqlTemplateImpl.java:73) at com.github.ltsopensource.store.jdbc.builder.UpdateSql.doUpdate(UpdateSql.java:180) ... 15 more
[lts] 2020-03-09 02:30:20,981 WARN [AbstractServerNode-thread-5] com.github.ltsopensource.core.logger.slf4j.Slf4jLogger.warn(56) | [LTS] ExecutingJobQueue already exist:{"cron":false,"extParams":{"date":"2020-03-09","ifMerge":"N","contractId":"798","customerId":"125","type":"contract","nodeId":"b67d7cd3b5d04ccb864718c0632c0db2"},"gmtCreated":1583692219264,"gmtModified":1583692220978,"internalExtParams":{"__LTS_Seq_Id":"1583692219264"},"jobId":"811CD55A5492458C88EB51B0391B5FE3","jobType":"REAL_TIME","lastGenerateTriggerTime":0,"maxRetryTimes":3,"needFeedback":false,"priority":100,"realTaskId":"contract:b67d7cd3b5d04ccb864718c0632c0db2-125-2020-03-09-false","relyOnPrevCycle":true,"repeatCount":0,"repeatInterval":0,"repeatable":false,"repeatedCount":0,"retryTimes":0,"running":true,"submitNodeGroup":"rentJobClient","taskId":"contract:b67d7cd3b5d04ccb864718c0632c0db2-125-2020-03-09-false","taskTrackerIdentity":"TT_10.10.103.12_31065_14-25-32.613_2","taskTrackerNodeGroup":"rentTaskTracker","triggerTime":1583692219264}, lts version: 1.7.0, current host: 10.10.63.225
fjq_*表删除循环报错:
{"action":"EXECUTE_SUCCESS","jobMeta":{"internalExtParams":{"__LTS_ONCE":"true","__LTS_Seq_Id":"1575502800000"},"job":{"cron":true,"cronExpression":"0 */20 * * * ?","extParams":{"type":"EquipmentOeePerformanceTimeStart"},"maxRetryTimes":2,"needFeedback":true,"priority":100,"relyOnPrevCycle":false,"repeatCount":0,"repeatInterval":0,"repeatable":false,"replaceOnExist":false,"submitNodeGroup":"WisEquipmentJobClient","taskId":"EquipmentOeePerformanceTimeStart_1205-074000","taskTrackerNodeGroup":"WisEquipmentTaskTracker","triggerTime":1575502800000},"jobId":"577CF1066DE3471088E4A3667C0616FE","jobType":"CRON","realTaskId":"EquipmentOeePerformanceTimeStart","repeatedCount":0,"retryTimes":0},"msg":"OK","time":1575502894606}
日志记录:
admin需要在mongodb和mysql并行支持
job的load本身就有scheduledFuture加载机制,且频率可以配置。
该段代码是单个节点下执行send,每次都触发所有节点的reload,逻辑上没有意义。
1.7.0代码
AbstractPreLoader:
`
/**
 * Polls the next job from the queue returned by getQueue for the node group.
 * Before polling, a load is requested when isInFactor(size) holds for the
 * current queue size and the group is not already in LOAD_SIGNAL.
 *
 * Returns whatever queue.poll() yields (may be null).
 */
private JobPo get(String taskTrackerNodeGroup) {
JobPriorityBlockingDeque queue = getQueue(taskTrackerNodeGroup);
int size = queue.size();
DotLogUtils.dot("AbstractPreLoader.queue size:{},taskTrackerNodeGroup:{}", size, taskTrackerNodeGroup);
**if (isInFactor(size)) {
// Request that triggers a load
// Only signal when the group is not already recorded in LOAD_SIGNAL.
if (!LOAD_SIGNAL.contains(taskTrackerNodeGroup)) {
LOAD_SIGNAL.add(taskTrackerNodeGroup);
doLoad();
}
}**
JobPo jobPo = queue.poll();
// NOTE(review): Integer.MIN_VALUE looks like a sentinel priority - confirm. For such a job,
// the value stored under Constants.OLD_PRIORITY (if any) is parsed, removed from the
// internal ext params, and set as the job's priority.
if (jobPo != null && jobPo.getPriority() == Integer.MIN_VALUE) {
if (CollectionUtils.isNotEmpty(jobPo.getInternalExtParams())) {
if (jobPo.getInternalExtParams().containsKey(Constants.OLD_PRIORITY)) {
try {
int priority = Integer.parseInt(jobPo.getInternalExtParam(Constants.OLD_PRIORITY));
jobPo.getInternalExtParams().remove(Constants.OLD_PRIORITY);
jobPo.setPriority(priority);
} catch (NumberFormatException ignored) {
// Stored value is not an integer: parseInt throws before the remove/setPriority
// calls, so the job's params and priority are left as they were.
}
}
}
}
return jobPo;
}`
节点组jobclient和tasktracker在自动创建上,大小写目前是敏感的,但是实际应用中配置人员会不小心弄混淆,建议去除大小写敏感,统一自动大写或者小写
Cycle Log 👍
[INFO ] [17:05:33] com.github.ltsopensource.jobtracker.support.checker.FeedbackJobSendChecker - [LTS] Send to client: 0 success, 0 failed., lts version: 1.6.9, current host: 172.20.30.117
....
This bug is in the jobtracker, at FeedbackJobSendChecker (line 161):
`do {
jobFeedbackPos = appContext.getJobFeedbackQueue().fetchTop(jobClientNodeGroup, limit);
if (CollectionUtils.isEmpty(jobFeedbackPos)) {
return;
}
List jobResults = new ArrayList(jobFeedbackPos.size());
for (JobFeedbackPo jobFeedbackPo : jobFeedbackPos) {
// 判断是否是过时的数据,如果是,那么移除
if (appContext.getOldDataHandler() == null ||
(!appContext.getOldDataHandler().handle(appContext.getJobFeedbackQueue(), jobFeedbackPo, jobFeedbackPo))) {
jobResults.add(new JobRunResultWrapper(jobFeedbackPo.getId(), jobFeedbackPo.getJobRunResult()));
}
}
// 返回发送成功的个数
int sentSize = clientNotifier.send(jobResults);
LOGGER.info("Send to client: {} success, {} failed.", sentSize, jobResults.size() - sentSize);
} while (jobFeedbackPos.size() > 0);`
and
/**
 * Converts every row of the result set into a JobFeedbackPo.
 *
 * @param rs result set providing the columns id, job_result and gmt_created
 * @return one JobFeedbackPo per row, in row order; an empty list when there are no rows
 * @throws SQLException if reading from the result set fails
 */
public List<JobFeedbackPo> handle(ResultSet rs) throws SQLException {
    final List<JobFeedbackPo> rows = new ArrayList<JobFeedbackPo>();
    for (boolean hasRow = rs.next(); hasRow; hasRow = rs.next()) {
        final JobFeedbackPo row = new JobFeedbackPo();
        row.setId(rs.getString("id"));
        // job_result holds the serialized JobRunResult.
        final String resultJson = rs.getString("job_result");
        row.setJobRunResult(JSON.parse(resultJson, new TypeReference<JobRunResult>() {
        }));
        row.setGmtCreated(rs.getLong("gmt_created"));
        rows.add(row);
    }
    return rows;
}
When the ResultSet rs is null or empty, jobFeedbackPos is an empty list, so the loop at FeedbackJobSendChecker (line 162) never terminates (an infinite loop).
1、实时任务中很多taskid含有时间,精确查询查不到;
2、模糊查询对用户更加友好,使用方便;
[rent] 2020-04-27 14:25:33,105 ERROR [RMI TCP Connection(3)-127.0.0.1] com.github.ltsopensource.core.logger.slf4j.Slf4jLogger.error(76) | [LTS] ========== Start failed, nodeType=JOB_CLIENT, identity=JC_169.254.99.112_13900_14-25-32.507_1, lts version: 1.7.0, current host: 169.254.99.112
java.lang.NoClassDefFoundError: org/I0Itec/zkclient/IZkStateListener
at com.github.ltsopensource.zookeeper.zkclient.ZkClientZookeeperTransporter.connect(ZkClientZookeeperTransporter.java:10) ~[lts-core-1.7.0.jar:?]
at com.github.ltsopensource.core.registry.zookeeper.ZookeeperRegistry.<init>(ZookeeperRegistry.java:41) ~[lts-core-1.7.0.jar:?]
at com.github.ltsopensource.core.registry.RegistryFactory.getRegistry(RegistryFactory.java:20) ~[lts-core-1.7.0.jar:?]
at com.github.ltsopensource.core.cluster.AbstractJobNode.initRegistry(AbstractJobNode.java:210) ~[lts-core-1.7.0.jar:?]
at com.github.ltsopensource.core.cluster.AbstractJobNode.start(AbstractJobNode.java:80) ~[lts-core-1.7.0.jar:?]
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ~[?:1.8.0_121]
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) ~[?:1.8.0_121]
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) ~[?:1.8.0_121]
at java.lang.reflect.Method.invoke(Method.java:498) ~[?:1.8.0_121]
at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.invokeCustomInitMethod(AbstractAutowireCapableBeanFactory.java:1719) ~[spring-beans-4.3.5.RELEASE.jar:4.3.5.RELEASE]
at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.invokeInitMethods(AbstractAutowireCapableBeanFactory.java:1656) ~[spring-beans-4.3.5.RELEASE.jar:4.3.5.RELEASE]
at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.initializeBean(AbstractAutowireCapableBeanFactory.java:1585) ~[spring-beans-4.3.5.RELEASE.jar:4.3.5.RELEASE]
某个node的服务器job push失败,会一直在执行任务表中不停的循环推送。
若是高频的task,每隔5秒启动一个,就会导致task大量阻塞,从而导致cpu打满。
多个节点的jobtracker其中一台阻塞,导致所有节点都不能执行job分发。
目前我们是2台服务器部署,发现一台阻塞,另外一台就不能支持服务了,job都不能push。
推测是阻塞那台是master,另一台是slave,具体原因不知
任务在执行一段时间后发现任务不再执行。查看队列表lts_executing_job_queue发现队列表已满,都是相同的taskid的任务。
只有手动删除队列表的数据,重启服务才会再执行。
希望能增加自动处理的方式
1、列出能支持的所有配置文件中的参数
2、所有参数采用默认值进行配置
3、lts-admin、job-tracker、lts-monitor、task-tracker配置文件
jobtracker不存在,bug,
现象:
jobclient重复提交taskId相同的job,得到response一直会是success。实际上jobtracker并不会存储这些job,直接扔掉了,打印一条info日志提示出现了exist job。
JobSubmitProcessor:
` /**
 * Handles a job submit request: passes the request to the job receiver and
 * builds the response command from the outcome.
 *
 * @param channel channel the request arrived on (not used in this method)
 * @param request command whose body is the JobSubmitRequest
 * @return a JOB_RECEIVE_SUCCESS response when receive() completes normally, or a
 *         JOB_RECEIVE_FAILED response carrying the message and failed jobs
 *         when it throws JobReceiveException
 * @throws RemotingCommandException declared by the processor contract
 */
@Override
public RemotingCommand processRequest(Channel channel, RemotingCommand request) throws RemotingCommandException {
    JobSubmitRequest jobSubmitRequest = request.getBody();
    JobSubmitResponse jobSubmitResponse = appContext.getCommandBodyWrapper().wrapper(new JobSubmitResponse());
    RemotingCommand response;
    try {
        appContext.getJobReceiver().receive(jobSubmitRequest);
        response = RemotingCommand.createResponseCommand(
                JobProtos.ResponseCode.JOB_RECEIVE_SUCCESS.code(), "job submit success!", jobSubmitResponse);
    } catch (JobReceiveException e) {
        // Report the failure back to the submitter, including which jobs were rejected.
        LOGGER.error("Receive job failed , jobs = " + jobSubmitRequest.getJobs(), e);
        jobSubmitResponse.setSuccess(false);
        jobSubmitResponse.setMsg(e.getMessage());
        jobSubmitResponse.setFailedJobs(e.getJobs());
        response = RemotingCommand.createResponseCommand(
                JobProtos.ResponseCode.JOB_RECEIVE_FAILED.code(), e.getMessage(), jobSubmitResponse);
    }
    return response;
}`
JobReceiver:
`/**
 * Converts a submitted job to its persistent form, assigns it a fresh job id
 * and adds it. A DupEntryException is handled either by replacing the existing
 * job (when the job asks for replace-on-exist) or by ignoring the submission.
 * The outcome is written to the biz log in every case that reaches the end.
 *
 * @param job     the submitted job
 * @param request the enclosing submit request; supplies the node group when the job has none
 * @return the converted job, or null when conversion yields null
 */
private JobPo addToQueue(Job job, JobSubmitRequest request) {
    JobPo converted = null;
    boolean accepted = false;
    BizLogCode bizCode = null;
    try {
        converted = JobDomainConverter.convert(job);
        if (converted == null) {
            LOGGER.warn("Job can not be null。{}", job);
            return null;
        }
        final boolean missingSubmitGroup = StringUtils.isEmpty(converted.getSubmitNodeGroup());
        if (missingSubmitGroup) {
            converted.setSubmitNodeGroup(request.getNodeGroup());
        }
        // Assign the job id
        converted.setJobId(JobUtils.generateJobId());
        // Add the job
        addJob(job, converted);
        accepted = true;
        bizCode = BizLogCode.SUCCESS;
    } catch (DupEntryException e) {
        // The job already exists
        if (!job.isReplaceOnExist()) {
            bizCode = BizLogCode.DUP_IGNORE;
            LOGGER.warn("Job already exist And ignore. nodeGroup={}, {}", request.getNodeGroup(), job);
        } else {
            Assert.notNull(converted);
            accepted = replaceOnExist(job, converted);
            if (accepted) {
                bizCode = BizLogCode.DUP_REPLACE;
            } else {
                bizCode = BizLogCode.DUP_FAILED;
            }
        }
    } finally {
        if (accepted) {
            stat.incReceiveJobNum();
            if (LOGGER.isDebugEnabled()) {
                LOGGER.debug("Receive Job success. {}", job);
            }
        }
    }

    // Write the biz log entry
    jobBizLog(converted, bizCode);
    return converted;
}`
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.