[Issue ported from https://github.com/bixolabs/cascading.avro/issues/4]
We have a situation with complex CoGroup joins and GroupBy operations where the input directory for one or more sources is sometimes empty. Before converting to Avro, we could point Cascading at an empty directory and simply get zero tuples back with no error. Now we get the error:
java.lang.IllegalStateException: scheme cannot be generated as no input file present!!
at com.icrossing.collection.cascading.avro.AvroTest.getAvroScheme(AvroTest.java:140)
at com.icrossing.collection.cascading.avro.AvroTest.testForSortOnString(AvroTest.java:92)
Cascading.Avro has all the information necessary to generate a Scheme, since we pass the fields and data types in at construction time. This is also not backward compatible with Cascading's behavior. I see two immediate alternatives to resolve this: either always create an output file with metadata whenever a flow runs, or create a virtual Scheme when no input file exists.
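As a rough sketch of the second alternative (a caller-side workaround, not a change to cascading.avro): since the fields and types are already known up front, the helper in the test case below could fall back to constructing an AvroScheme directly instead of throwing. The method name getAvroSchemeOrFallback and the expectedTypes parameter are hypothetical.
// Hypothetical fallback, not part of cascading.avro: when no input file
// exists, build a "virtual" scheme from the fields/types the caller already
// knows, so the flow still runs and simply reads zero tuples from this source.
private static Scheme getAvroSchemeOrFallback(Path filePath, Configuration conf,
        Fields selectorFieldList, Class[] expectedTypes)
    throws IOException
{
    FileSystem dfs = FileSystem.get(conf);
    if (dfs.isFile(filePath))
    {
        // Input file exists: derive the scheme from the embedded Avro schema,
        // reusing the readAvroFileToGetScheme() helper from the test case below.
        return readAvroFileToGetScheme(filePath, conf, selectorFieldList);
    }
    return new AvroScheme(selectorFieldList, expectedTypes);
}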
Test case:
package com.icrossing.collection.cascading.avro;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.FsInput;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
import org.junit.After;
import org.junit.Test;
import com.bixolabs.cascading.avro.AvroScheme;
import cascading.CascadingTestCase;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.pipe.CoGroup;
import cascading.pipe.Pipe;
import cascading.pipe.cogroup.OuterJoin;
import cascading.scheme.Scheme;
import cascading.scheme.TextDelimited;
import cascading.tap.Lfs;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterator;
public class AvroTest
extends CascadingTestCase
{
private static final String DIR_NAME = "./testdata/";
private static final String SOURCE_FILE_EMP = DIR_NAME + "emp/part-00000.avro";
private static final String OUTPUT_DIR = DIR_NAME + "output";
private static final String SOURCE_FILE_MGMT = DIR_NAME + "mgmt/part-00000.avro";
private static final Logger LOG = Logger.getLogger(AvroTest.class);
private static final Fields EMP_FIELDS = new Fields("e_id", "e_salary");
private static final Fields MGMT_FIELDS = new Fields("m_id", "m_designation");
public void setUpData()
throws Exception
{
createDirectory(DIR_NAME);
createDirectory(DIR_NAME + "emp/");
createDirectory(DIR_NAME + "mgmt/");
Class[] empTypes = new Class[] { Integer.class, Long.class };
Tap empSink = new Lfs(new AvroScheme(EMP_FIELDS, empTypes), DIR_NAME + "emp/");
TupleEntryCollector out = empSink.openForWrite(new JobConf());
TupleEntry t = new TupleEntry(EMP_FIELDS, new Tuple(new Object[empTypes.length]));
t.set("e_id", 1);
t.set("e_salary", 320000L);
out.add(t);
out.close();
}
@After
public void tearDown()
throws Exception
{
deleteFile(SOURCE_FILE_EMP);
deleteDirectoryRecursively(DIR_NAME);
}
@Test
public void testForSortOnString()
throws Exception
{
setUpData();
/**
* get AVRO scheme from created avro file...
*/
Scheme empScheme =
getAvroScheme(new Path(SOURCE_FILE_EMP), new Configuration(), EMP_FIELDS);
/**
* avro file does not exist
*/
Scheme mgmtScheme =
getAvroScheme(new Path(SOURCE_FILE_MGMT), new Configuration(), MGMT_FIELDS);
Tap emp = new Lfs(empScheme, SOURCE_FILE_EMP);
Tap mgmt = new Lfs(mgmtScheme, SOURCE_FILE_MGMT);
Map<String, Tap> source = new HashMap<String, Tap>();
source.put("emp", emp);
source.put("mgmt", mgmt);
Tap sink = new Lfs(new TextDelimited(Fields.ALL, "\t"), OUTPUT_DIR, true);
Pipe empPipe = new Pipe("emp");
Pipe mgmtPipe = new Pipe("mgmt");
Pipe assembly =
new CoGroup(empPipe, new Fields("e_id"), mgmtPipe, new Fields("m_id"), new OuterJoin());
Flow flow = new FlowConnector().connect(source, sink, assembly);
flow.complete();
TupleEntryIterator tupleEntryIteratorInvalid = sink.openForRead(flow.getJobConf());
LOG.info("Result : ");
while (tupleEntryIteratorInvalid.hasNext())
{
LOG.info(tupleEntryIteratorInvalid.next().getTuple());
}
}
/**
* Derives the Avro Scheme from an existing .avro file.
*
* @param filePath
* @param conf
* @param selectorFieldList
* @return the generated Scheme, or null if the file could not be read
*/
private static Scheme getAvroScheme(Path filePath, Configuration conf, Fields selectorFieldList)
{
Scheme outputScheme = null;
boolean isFilePresent = false;
try
{
FileSystem dfs = FileSystem.get(conf);
if (dfs.isFile(filePath))
{
isFilePresent = true;
outputScheme = readAvroFileToGetScheme(filePath, conf, selectorFieldList);
}
if (!isFilePresent)
{
LOG.error("scheme cannot be generated as no input file present!!");
throw new IllegalStateException(
"scheme cannot be generated as no input file present!!");
}
}
catch (IOException e)
{
LOG.warn("Error retrieving avro part file", e);
}
return outputScheme;
}
/**
* Reads the Avro data file and builds an AvroScheme from its embedded schema.
*
* @param avroFilePath
* @param conf
* @param selectorFieldList
* @return the generated Scheme
* @throws IOException
*/
@SuppressWarnings("unchecked")
private static Scheme readAvroFileToGetScheme(Path avroFilePath, Configuration conf,
Fields selectorFieldList)
throws IOException
{
DataFileReader dataFileReader = null;
try
{
dataFileReader =
new DataFileReader(new FsInput(avroFilePath, conf),
new GenericDatumReader());
Schema schema = dataFileReader.getSchema();
Comparable fieldNames[] = new Comparable[selectorFieldList.size()];
Class types[] = new Class[selectorFieldList.size()];
for (int fieldCount = 0; fieldCount < schema.getFields().size(); fieldCount++)
{
Field field = schema.getFields().get(fieldCount);
for (int reqFieldCount = 0; reqFieldCount < selectorFieldList.size(); reqFieldCount++)
{
if (StringUtils.equals(selectorFieldList.get(reqFieldCount).toString(), field.name()))
{
fieldNames[reqFieldCount] = field.name();
// Assumes the field schema is a nullable union [null, type]; index 1 holds the actual type.
types[reqFieldCount] = getClass(field.schema().getTypes().get(1).getName());
}
}
}
Fields fields = new Fields(fieldNames);
return new AvroScheme(fields, types);
}
finally
{
if (dataFileReader != null)
{
dataFileReader.close();
}
}
}
/**
* Maps an Avro primitive type name to the corresponding Java class.
*
* @param dataType
* @return the Java class, or null for unsupported types
*/
private static Class getClass(String dataType)
{
Class type = null;
if (StringUtils.equals("int", dataType))
{
type = Integer.class;
}
else if (StringUtils.equals("long", dataType))
{
type = Long.class;
}
return type;
}
private boolean createDirectory(String dirName)
throws IOException
{
boolean isSuccessful = false;
File directory = new File(dirName);
isSuccessful = directory.mkdirs();
return (isSuccessful);
}
private boolean deleteFile(String fileName)
throws IOException
{
boolean isSuccessful = false;
File file = new File(fileName);
if (file.exists()) isSuccessful = file.delete();
return (isSuccessful);
}
private boolean deleteDirectoryRecursively(String dirName)
throws IOException
{
if (dirName == null)
{
return false;
}
File directory = new File(dirName);
if (directory.isDirectory())
{
String[] files = directory.list();
for (int i = 0; i < files.length; i++)
{
File child = new File(dirName + File.separator + files[i]);
if (child.isDirectory())
{
deleteDirectoryRecursively(dirName + File.separator + files[i]);
}
else
{
boolean success = deleteFile(dirName + File.separator + files[i]);
if (!success) return false;
}
}
}
return (directory.delete());
}
}