run.sh
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Driver for the ScalaSparkScan workload: generates the scan SQL file, submits
# com.intel.hibench.sparkbench.sql.ScalaSparkSQLBench with it, then reports
# the elapsed time and the size of the output directory.
#
# Every helper called below (enter_bench, show_bannar, prepare_sql_scan,
# timestamp, rmr_hdfs, run_spark_job, dir_size, gen_report, leave_bench) and
# the WORKLOAD_RESULT_FOLDER / OUTPUT_HDFS variables are not defined in this
# script; they are expected to be provided by sourcing load_bench_config.sh.

# Resolve the absolute directory of this script; abort if it cannot be entered
# so that the relative root_dir below is never built from a bogus path.
current_dir=$(dirname "$0")
current_dir=$(cd "$current_dir" && pwd) || exit 1
root_dir=${current_dir}/../../../../../
workload_config=${root_dir}/conf/workloads/sql/scan.conf
. "${root_dir}/bin/functions/load_bench_config.sh"

enter_bench ScalaSparkScan "${workload_config}" "${current_dir}"
show_bannar start

# prepare SQL
HIVEBENCH_SQL_FILE=${WORKLOAD_RESULT_FOLDER}/rankings_uservisits_scan.hive
prepare_sql_scan "${HIVEBENCH_SQL_FILE}"

# Timed section: clear the previous output, then run the Spark job.
START_TIME=$(timestamp)
rmr_hdfs "$OUTPUT_HDFS"
run_spark_job com.intel.hibench.sparkbench.sql.ScalaSparkSQLBench ScalaScan "${HIVEBENCH_SQL_FILE}"
END_TIME=$(timestamp)

sleep 5
SIZE=$(dir_size "$OUTPUT_HDFS")
# Report a size of 0 when dir_size produced no output.
gen_report "${START_TIME}" "${END_TIME}" "${SIZE:-0}"
show_bannar finish
leave_bench
workload_functions.sh
#######################################
# Build a spark-submit command line for a benchmark class (or a *.py script)
# and run it through execute_withlog while a monitor is active.
#
# Usage: run_spark_job [--jars <jars>]... <class-or-py-file> [job args...]
#   Leading "--jars <value>" pairs are consumed; only the last one is kept.
#
# Globals read:
#   SPARK_HOME, SPARK_MASTER, SPARK_PROP_CONF, SPARKBENCH_JAR,
#   YARN_NUM_EXECUTORS, YARN_EXECUTOR_CORES, SPARK_YARN_EXECUTOR_MEMORY,
#   SPARK_YARN_DRIVER_MEMORY, WORKLOAD_RESULT_FOLDER, and the colour
#   variables (BGreen, Green, BRed, BYellow, BBlue, Color_Off).
# Globals written (left non-local on purpose; other code may read them):
#   LIB_JARS, CLS, YARN_OPTS, SUBMIT_CMD, MONITOR_PID, result
# Helpers used (defined outside this function):
#   export_withlog, start_monitor, execute_withlog, stop_monitor
# Returns:
#   0 when the submitted job succeeds; 1 when --jars has no value.
#   On job failure the tail of bench.log is printed and the shell EXITS with
#   the job's status.
#######################################
function run_spark_job() {
  LIB_JARS=
  while (($#)); do
    if [ "$1" = "--jars" ]; then
      # Without this guard "shift 2" fails when --jars is the last argument,
      # nothing is shifted, and the loop never terminates.
      if (($# < 2)); then
        echo "run_spark_job: --jars requires a value" >&2
        return 1
      fi
      LIB_JARS="--jars $2"
      shift 2
      continue
    fi
    break
  done

  CLS=$1
  shift

  export_withlog SPARKBENCH_PROPERTIES_FILES

  YARN_OPTS=""
  if [[ "$SPARK_MASTER" == yarn-* ]]; then
    export_withlog HADOOP_CONF_DIR

    YARN_OPTS="--num-executors ${YARN_NUM_EXECUTORS}"
    if [[ -n "${YARN_EXECUTOR_CORES:-}" ]]; then
      YARN_OPTS="${YARN_OPTS} --executor-cores ${YARN_EXECUTOR_CORES}"
    fi
    if [[ -n "${SPARK_YARN_EXECUTOR_MEMORY:-}" ]]; then
      YARN_OPTS="${YARN_OPTS} --executor-memory ${SPARK_YARN_EXECUTOR_MEMORY}"
    fi
    # Fixed: the test used the misspelled SPAKR_YARN_DRIVER_MEMORY, so the
    # configured driver memory was never passed to spark-submit.
    if [[ -n "${SPARK_YARN_DRIVER_MEMORY:-}" ]]; then
      YARN_OPTS="${YARN_OPTS} --driver-memory ${SPARK_YARN_DRIVER_MEMORY}"
    fi
  fi

  if [[ "$CLS" == *.py ]]; then
    # Python entry point: the benchmark jar is shipped via --jars and the
    # script itself is the application.
    # NOTE(review): if the caller also passed --jars, the command ends up with
    # two --jars options -- confirm spark-submit handles that as intended.
    LIB_JARS="$LIB_JARS --jars ${SPARKBENCH_JAR}"
    SUBMIT_CMD="${SPARK_HOME}/bin/spark-submit ${LIB_JARS} --properties-file ${SPARK_PROP_CONF} --master ${SPARK_MASTER} ${YARN_OPTS} ${CLS} $@"
  else
    SUBMIT_CMD="${SPARK_HOME}/bin/spark-submit ${LIB_JARS} --properties-file ${SPARK_PROP_CONF} --class ${CLS} --master ${SPARK_MASTER} ${YARN_OPTS} ${SPARKBENCH_JAR} $@"
  fi
  echo -e "${BGreen}Submit Spark job: ${Green}${SUBMIT_CMD}${Color_Off}"

  MONITOR_PID=$(start_monitor)
  # SUBMIT_CMD is a flat string: it is left unquoted on purpose so the shell
  # splits it back into the individual command words.
  execute_withlog ${SUBMIT_CMD}
  result=$?
  stop_monitor ${MONITOR_PID}

  if [ $result -ne 0 ]; then
    echo -e "${BRed}ERROR${Color_Off}: Spark job ${BYellow}${CLS}${Color_Off} failed to run successfully."
    echo -e "${BBlue}Hint${Color_Off}: You can goto ${BYellow}${WORKLOAD_RESULT_FOLDER}/bench.log${Color_Off} to check for detailed log.\nOpening log tail for you:\n"
    tail "${WORKLOAD_RESULT_FOLDER}/bench.log"
    exit $result
  fi
}
ScalaSparkSQLBench.scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.intel.hibench.sparkbench.sql

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

/**
 * Ported from HiBench's hive bench.
 *
 * Reads a SQL script from the local file system, splits it on ';' and submits
 * every non-blank statement to a HiveContext, in file order.
 */
object ScalaSparkSQLBench {

  /**
   * @param args args(0) is used as the Spark application name,
   *             args(1) is the path of the SQL script to execute.
   *             With fewer than two arguments a usage line is printed to
   *             stderr and the JVM exits with status 1.
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      // The previous message interpolated the object itself
      // (s"Usage: $ScalaSparkSQLBench"), printing its toString and no
      // argument description.
      System.err.println(
        "Usage: ScalaSparkSQLBench <workload name> <SQL script file>"
      )
      System.exit(1)
    }
    val workload_name = args(0)
    val sql_file = args(1)

    val sparkConf = new SparkConf().setAppName(workload_name)
    val sc = new SparkContext(sparkConf)
    try {
      val hc = new HiveContext(sc)

      // Close the file handle as soon as the script text has been read.
      val source = scala.io.Source.fromFile(sql_file)
      val _sql = try source.mkString finally source.close()

      // Plain split on ';' -- a semicolon inside a string literal or comment
      // would also split the statement.
      _sql.split(';').foreach { x =>
        if (x.trim.nonEmpty) hc.sql(x)
      }
    } finally {
      // Stop the context even when reading the script or a statement fails.
      sc.stop()
    }
  }
}
HiveData.java
package HiBench;import java.io.IOException;import java.net.URISyntaxException;import java.util.HashMap;import java.util.Iterator;import java.util.Random;import org.apache.commons.logging.Log;import org.apache.commons.logging.LogFactory;import org.apache.hadoop.filecache.DistributedCache;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapred.FileInputFormat;import org.apache.hadoop.mapred.FileOutputFormat;import org.apache.hadoop.mapred.JobClient;import org.apache.hadoop.mapred.JobConf;import org.apache.hadoop.mapred.MapFileOutputFormat;import org.apache.hadoop.mapred.MapReduceBase;import org.apache.hadoop.mapred.Mapper;import org.apache.hadoop.mapred.OutputCollector;import org.apache.hadoop.mapred.Reducer;import org.apache.hadoop.mapred.Reporter;import org.apache.hadoop.mapred.SequenceFileInputFormat;import org.apache.hadoop.mapred.SequenceFileOutputFormat;import org.apache.hadoop.mapred.TextInputFormat;import org.apache.hadoop.mapred.TextOutputFormat;import org.apache.hadoop.mapred.lib.MultipleInputs;import org.apache.hadoop.mapred.lib.NLineInputFormat;public class HiveData { private static final Log log = LogFactory.getLog(HiveData.class.getName()); private static final String RANKINGS = "rankings"; private static final String USERVISITS = "uservisits"; public static final String uagentf = "user_agents"; public static final String countryf = "country_codes"; public static final String searchkeyf = "search_keys"; private DataOptions options; private long visits; // client side delim private String cdelim = ","; private int chashsize = 150 * 1024 * 1024; private Dummy dummy; HiveData(DataOptions options) { this.options = options; parseArgs(options.getRemainArgs()); } private void parseArgs(String[] args) { for (int i=0; ioptions.getNumPages()) { chashsize = (int) options.getNumPages(); } } private void setRankingsOptions(JobConf job) throws URISyntaxException { job.setLong("pages", 
options.getNumPages()); job.setLong("slotpages", options.getNumSlotPages()); job.set("delimiter", cdelim); job.setInt("hashsize", chashsize); Utils.shareLinkZipfCore(options, job); } private void setVisitsOptions(JobConf job) { job.setInt("slots", options.getNumMaps()); job.setLong("pages", options.getNumPages()); job.setLong("visits", visits); job.set("delimiter", cdelim); } public static class DummyToRankingsMapper extends MapReduceBase implements Mapper { private static final Log log = LogFactory.getLog(DummyToRankingsMapper.class.getName()); private HtmlCore generator; private long pages, slotpages; private boolean outset; private OutputCollector myout; private JoinBytesInt uitem, ritem; private short[] hash; private HashMap hm; private int hashsize; private void getOptions(JobConf job) { pages = job.getLong("pages", 0); slotpages = job.getLong("slotpages", 0); hashsize = job.getInt("hashsize", 0); } public void configure(JobConf job) { getOptions(job); try { generator = new HtmlCore(job); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } outset = false; myout = null; uitem = new JoinBytesInt(); uitem.url = new byte[HtmlCore.getMaxUrlLength()]; ritem = new JoinBytesInt(); ritem.refs = 1; hash = new short[hashsize]; hm = new HashMap (); } public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException { if (!outset) { myout = output; outset = true; } int slotId = Integer.parseInt(value.toString().trim()); generator.fireRandom(slotId); long[] range = HtmlCore.getPageRange(slotId, pages, slotpages); /** * For output collect */ for (long i=range[0]; i =0) { if (hash[iid]==HtmlCore.MAX_SHORT) { hm.put(iid, (int) (hash[iid]) + 1); hash[iid] = -1; } else { hash[iid]++; } } else { hm.put(iid, hm.get(iid) + 1); } } else { key.set(uid); output.collect(key, ritem); } } if (0==(i % 10000)) { log.info("still running: " + (i - range[0]) + " of " + slotpages); } } } @Override public void close () { 
try { LongWritable k = new LongWritable(); for (int i=0; i 0) { k.set(i); ritem.refs = hash[i]; myout.collect(k, ritem); } else if (hash[i] < 0) { k.set(i); ritem.refs = hm.get(i); myout.collect(k, ritem); } } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } public static class JoinBytesIntCombiner extends MapReduceBase implements Reducer {// Log log = null; JoinBytesInt item; @Override public void configure (JobConf job) { item = new JoinBytesInt();// log = LogFactory.getLog(JoinBytesIntCombiner.class.getName()); } @Override public void reduce(LongWritable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { item.clear();// StringBuffer sb = new StringBuffer("Combine: " + v.toString()); while (values.hasNext()) { item.add(values.next());// sb.append("-> " + v.toString()); } output.collect(key, item);// log.info(sb); } } public static class GenerateRankingsReducer extends MapReduceBase implements Reducer { private static final Log log = LogFactory.getLog(GenerateRankingsReducer.class.getName()); private Random rand; private int errors, missed; private JoinBytesInt v; private int pid; // job side delimiter private String delim;// private String missedids; public void configure (JobConf job) { delim = job.get("delimiter"); pid = job.getInt("mapred.task.partition", 0); rand = new Random(pid + 1); v = new JoinBytesInt(); errors = 0; missed = 0;// missedids = ""; } public void close () { log.info("pid: " + pid + ", " + errors + " erros, " + missed + " missed"); } @Override public void reduce(LongWritable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { v.clear(); while (values.hasNext()) { v.add(values.next()); } if (0!=v.ulen) { if (v.refs > 0) { Text value = new Text( new String(v.url) + delim + v.refs + delim + (rand.nextInt(99) + 1) ); output.collect( key, value); reporter.incrCounter(HiBench.Counters.BYTES_DATA_GENERATED, 8+value.getLength()); } else { 
missed++; } } else { errors++; } } } private void createRankingsTableDirectly() throws IOException, URISyntaxException { log.info("Creating table rankings..."); Path fout = new Path(options.getResultPath(), RANKINGS); JobConf job = new JobConf(HiveData.class); String jobname = "Create rankings"; /** TODO: change another more effective way as this operation may cause * about 2 min delay (originally ~15min in total) */ setRankingsOptions(job); job.setJobName(jobname); job.set("mapred.reduce.slowstart.completed.maps", "0.3"); job.set("mapreduce.job.reduce.slowstart.completedmaps", "0.3"); job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(Text.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(JoinBytesInt.class); job.setJarByClass(DummyToRankingsMapper.class); job.setJarByClass(JoinBytesIntCombiner.class); job.setJarByClass(GenerateRankingsReducer.class); job.setMapperClass(DummyToRankingsMapper.class); job.setCombinerClass(JoinBytesIntCombiner.class); job.setReducerClass(GenerateRankingsReducer.class); if (options.getNumReds() > 0) { job.setNumReduceTasks(options.getNumReds()); } else { job.setNumReduceTasks(Utils.getMaxNumReds()); } job.setInputFormat(NLineInputFormat.class); FileInputFormat.setInputPaths(job, dummy.getPath()); job.set("mapred.map.output.compression.type", "BLOCK"); job.set("mapreduce.output.fileoutputformat.compress.type","BLOCK"); MapFileOutputFormat.setCompressOutput(job, true);// MapFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.LzoCodec.class); MapFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.DefaultCodec.class); if (options.isSequenceOut()) { job.setOutputFormat(SequenceFileOutputFormat.class); } else { job.setOutputFormat(TextOutputFormat.class); } if (null != options.getCodecClass()) { job.set("mapred.output.compression.type","BLOCK"); job.set("mapreduce.output.fileoutputformat.compress.type","BLOCK"); 
FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, options.getCodecClass()); } FileOutputFormat.setOutputPath(job, fout); log.info("Running Job: " +jobname); log.info("Pages file " + dummy.getPath() + " as input"); log.info("Rankings file " + fout + " as output"); JobClient.runJob(job); log.info("Finished Running Job: " + jobname); } /*** * Mapper to randomly create user visits. In map step, only the target * urls of user visits are created, the rest content of visits will be * created in reduce step * @author lyi2 * */ public static class DummyToAccessNoMapper extends MapReduceBase implements Mapper { private JoinBytesInt vitem; private long pages; private long slots; private long visits; // job side delimiter private String delim; private Visit visit; public void configure (JobConf job) { try { pages = job.getLong("pages", 0); slots = job.getLong("slots", 0); visits = job.getLong("visits", 0); delim = job.get("delimiter"); visit = new Visit(DistributedCache.getLocalCacheFiles(job), delim, pages); vitem = new JoinBytesInt(); vitem.refs = 1; } catch (IOException e) { e.printStackTrace(); } } @Override public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException { int slotId = Integer.parseInt(value.toString().trim()); visit.fireRandom(slotId); for (long i=slotId; i<=visits;) { // simply setting url id is fine in map step key.set(visit.nextUrlId()); output.collect(key, vitem); i = i + slots; } } } public static class SequenceRankingsToUrlsMapper extends MapReduceBase implements Mapper { public JoinBytesInt uitem; public void configure(JobConf job) { uitem = new JoinBytesInt();// getBasicOptions(job); } @Override public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException { uitem.url= value.toString().split(",")[0].getBytes(); uitem.ulen = (byte) uitem.url.length; output.collect(key, uitem); } } public static class 
TextRankingsToUrlsMapper extends MapReduceBase implements Mapper { public JoinBytesInt uitem; public void configure(JobConf job) { uitem = new JoinBytesInt();// getBasicOptions(job); } @Override public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException { String[] items = value.toString().split("[,\t]"); key.set(Long.parseLong(items[0])); uitem.url= items[1].getBytes(); uitem.ulen = (byte) uitem.url.length; output.collect(key, uitem); } } public static class CreateUserVisitsReducer extends MapReduceBase implements Reducer { private static final Log log = LogFactory.getLog(CreateUserVisitsReducer.class.getName()); private long pages; private Visit visit; private int errors, missed; private JoinBytesInt vitem; // job side delimiter private String delim; private int pid; public void configure (JobConf job) { try { pages = job.getLong("pages", 0); delim = job.get("delimiter"); pid = job.getInt("mapred.task.partition", 0); visit = new Visit(DistributedCache.getLocalCacheFiles(job), delim, pages); visit.fireRandom(pid + 1); vitem = new JoinBytesInt(); errors = 0; missed = 0; } catch (IOException e) { e.printStackTrace(); } } public void close () { log.info("pid: " + pid + ", " + errors + " erros, " + missed + " missed"); } /** * Reduce: to sum up the record sizes (of slots) one by one so that to determine the * corresponding start point to hold the records for each slot. 
*/ @Override public void reduce(LongWritable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { vitem.clear();// StringBuffer sb = new StringBuffer("Reduce: " + v.toString()); while (values.hasNext()) { vitem.add(values.next());// sb.append("-> " + v.toString()); }// log.info(sb); if (0!=vitem.ulen) { if (vitem.refs > 0) { for (int i=0; i 0) { job.setNumReduceTasks(options.getNumReds()); } else { job.setNumReduceTasks(Utils.getMaxNumReds()); }// job.setNumReduceTasks(options.slots/2); if (options.isSequenceOut()) { job.setOutputFormat(SequenceFileOutputFormat.class); } else { job.setOutputFormat(TextOutputFormat.class); } if (null != options.getCodecClass()) { job.set("mapred.output.compression.type","BLOCK"); job.set("mapreduce.output.fileoutputformat.compress.type","BLOCK"); FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, options.getCodecClass()); } FileOutputFormat.setOutputPath(job, fout); log.info("Running Job: " +jobname); log.info("Dummy file " + dummy.getPath() + " as input"); log.info("Rankings file " + rankings + " as input"); log.info("Ouput file " + fout); JobClient.runJob(job); log.info("Finished Running Job: " + jobname); } public void generate() throws Exception { log.info("Generating hive data files..."); init(); createRankingsTableDirectly(); createUserVisitsTableDirectly(); close(); } public void loadFiles() throws IOException { RawData.createSearchKeys(new Path(options.getWorkPath(), searchkeyf)); RawData.createUserAgents(new Path(options.getWorkPath(), uagentf)); RawData.createCCodes(new Path(options.getWorkPath(), countryf)); } private void init() throws IOException { log.info("Initializing hive date generator..."); Utils.checkHdfsPath(options.getResultPath(), true); Utils.checkHdfsPath(options.getWorkPath(), true); loadFiles(); Utils.serialLinkZipf(options); dummy = new Dummy(options.getWorkPath(), options.getNumMaps()); } public void close() throws IOException { 
log.info("Closing hive data generator..."); Utils.checkHdfsPath(options.getWorkPath()); }}