challenge06.bib
% Editorial for the First Provenance Challenge: presents the challenge
% workflow and queries and summarises the participants' contributions.
% Names use the "Last, First" / "Last, Jr., First" forms so that suffixes and
% accents parse correctly; only the proper noun in the title is brace-protected.
% NOTE(review): the "_old" key suffix suggests this entry may be superseded -- confirm before citing.
@ARTICLE{Editorial:Challenge06_old,
  AUTHOR    = {Moreau, Luc and
               Lud{\"a}scher, Bertram and
               Altintas, Ilkay and
               Barga, Roger S. and
               Bowers, Shawn and
               Callahan, Steven and
               Chin, Jr., George and
               Clifford, Ben and
               Cohen, Shirley and
               Cohen-Boulakia, Sarah and
               Davidson, Susan and
               Deelman, Ewa and
               Digiampietri, Luciano and
               Foster, Ian and
               Freire, Juliana and
               Frew, James and
               Futrelle, Joe and
               Gibson, Tara and
               Gil, Yolanda and
               Goble, Carole and
               Golbeck, Jennifer and
               Groth, Paul and
               Holland, David A. and
               Jiang, Sheng and
               Kim, Jihie and
               Koop, David and
               Krenek, Ales and
               McPhillips, Timothy and
               Mehta, Gaurang and
               Miles, Simon and
               Metzger, Dominic and
               Munroe, Steve and
               Myers, Jim and
               Plale, Beth and
               Podhorszki, Norbert and
               Ratnakar, Varun and
               Santos, Emanuele and
               Scheidegger, Carlos and
               Schuchardt, Karen and
               Seltzer, Margo and
               Simmhan, Yogesh L. and
               Silva, Claudio and
               Slaughter, Peter and
               Stephan, Eric and
               Stevens, Robert and
               Turi, Daniele and
               Vo, Huy and
               Wilde, Mike and
               Zhao, Jun and
               Zhao, Yong},
  TITLE     = {The First {Provenance Challenge}},
  JOURNAL   = {Concurrency and Computation: Practice and Experience},
  YEAR      = {2008},
  EUPUB     = {yes},
  OPTKEY    = {},
  VOLUME    = {20},
  NUMBER    = {5},
  OPTPAGES  = {},
  OPTMONTH  = {},
  OPTNOTE   = {},
  OPTANNOTE = {},
  ABSTRACT  = {The first Provenance Challenge was set up in order to provide a forum for the community to help understand the capabilities of different provenance systems
and the expressiveness of their provenance representations.
To this end, a Functional
Magnetic Resonance Imaging workflow was defined, which participants
had to either simulate or run in order to produce some provenance
representation, from which a set of identified queries had to be
implemented and executed. Sixteen teams responded to the
challenge, and submitted their inputs. In this paper, we present
the challenge workflow and queries, and summarise the participants'
contributions.}
}
% Content-management approach to provenance capture (RDF over HTTP, query
% language based on the DAV Searching and Locating protocol).
% NOTE(review): NUMBER is set to 5 to agree with every other entry from this
% volume in the file -- confirm against the published issue.
@ARTICLE{SCM:Challenge06,
  AUTHOR    = {Schuchardt, Karen and
               Gibson, Tara and
               Stephan, Eric and
               Chin, Jr., George},
  TITLE     = {Applying Content Management to Automated Provenance Capture},
  JOURNAL   = {Concurrency and Computation: Practice and Experience},
  YEAR      = {2008},
  OPTKEY    = {},
  VOLUME    = {20},
  NUMBER    = {5},
  OPTPAGES  = {},
  OPTMONTH  = {},
  OPTNOTE   = {},
  OPTANNOTE = {},
  ABSTRACT  = {Systems science research uses a mixture of experiments, theoretical
computations, and simulations, often augmented by further analyses,
to solve complex problems. In many cases, these processes are
conducted manually and often by multiple people and computing
systems resulting in an incomplete record of results. Scientific
workflow tools are beginning to be employed as a means of executing
and repeating processing. Adding automatic provenance capture to
workflow tools can result in complete, accurate records of data
history as well as enable more efficient, robust workflow
environments. Our goal in addressing the provenance challenge was
to combine, and as necessary, extend a variety of standard
technologies, protocols, and schemas to implement workflow
provenance and data capture, answer the challenge queries, and
explore a general architecture for scientific provenance
capture. Our implementation applies a scientific content management
system for provenance and data persistence, RDF over HTTP for a
provenance API, and our own semantic query language based on the
DAV Searching and Locating protocol. Our implementation offers
several unique capabilities, and through the use of standards, is
able to accommodate a variety of widely available client tools
against the provenance record.}
}
% Layered model for workflow execution provenance, captured by a commercial
% workflow engine and stored in a relational database.
% "eScience" is braced so sentence-casing styles do not lowercase it.
@ARTICLE{REDUX:Challenge06,
  AUTHOR    = {Barga, Roger S. and
               Digiampietri, Luciano A.},
  TITLE     = {Automatic Capture and Efficient Storage of {eScience} Experiment Provenance},
  JOURNAL   = {Concurrency and Computation: Practice and Experience},
  YEAR      = {2008},
  OPTKEY    = {},
  VOLUME    = {20},
  NUMBER    = {5},
  OPTPAGES  = {},
  OPTMONTH  = {},
  OPTNOTE   = {},
  OPTANNOTE = {},
  ABSTRACT  = {Workflow is playing an increasingly important role in conducting
e-Science experiments, but most commercial systems lack the
necessary support for the collection and management of
provenance data. We argue that eScience provenance data
should be automatically generated by the workflow enactment
engine and managed over time by an underlying storage
service. In this paper, we introduce a layered model for
workflow execution provenance, which allows navigation from
an abstract model of the experiment to instance data
collected during a specific experiment run. We outline
modest extensions to a commercial workflow engine so it will
automatically capture this provenance data at runtime. We
then present an approach to store this provenance data in a
relational database engine. Finally, we identify important
properties of provenance data captured by our model that can
significantly reduce the amount of storage required, and
demonstrate we can reduce the size of provenance data
captured from an actual experiment to 0.4\% of the original
size, with modest performance overhead.}
}
% Provenance model for collection-oriented scientific workflows, with an
% implementation in Kepler.
% The umlaut is written as a BibTeX special character ({\"a}) so sorting and
% label generation treat it as a single letter.
@ARTICLE{COMAD:Challenge06,
  AUTHOR    = {Bowers, Shawn and
               McPhillips, Timothy M. and
               Lud{\"a}scher, Bertram},
  TITLE     = {Provenance in Collection-Oriented Scientific Workflows},
  JOURNAL   = {Concurrency and Computation: Practice and Experience},
  YEAR      = {2008},
  OPTKEY    = {},
  VOLUME    = {20},
  NUMBER    = {5},
  OPTPAGES  = {},
  OPTMONTH  = {},
  OPTNOTE   = {},
  OPTANNOTE = {},
  ABSTRACT  = {Scientific workflows frequently operate over nested collections of
data. These collections are often produced by workflow steps,
e.g., where one actor outputs a collection of data items (such as a
list of transcription factors), which is read by another actor that
produces a nested collection for each item (such as a list of
functions associated with each transcription factor). As a result,
data flow becomes increasingly nested, requiring workflows to
implement complex data management tasks. In previous work, we have
proposed a framework for transparently supporting nested data
collections in scientific workflows. Our framework provides a
number of advantages, including simpler workflow designs (compared
to conventional approaches), the ability to concurrently execute
actors over collection contents, on-the-fly customization of actor
behavior, and improved handling of workflow exceptions.
In this paper, we describe a provenance model tailored to
collection-oriented workflows, in which only a minimal number of
provenance events are required to recreate data dependencies and
process details. We also describe an implementation in Kepler for
(semi-) automatically capturing this provenance information. Our
implementation embeds provenance events as tokens directly within a
data stream, and produces self-contained trace files for workflow
runs. Finally, we describe a prototype provenance reasoning and
query engine for collection-oriented traces, and demonstrate our
approach using the workflow and queries of the Provenance
Challenge.}
}
% Read, Write, State-reset (RWS) provenance model and its implementation
% within Kepler.
% Title is stored in Title Case with the acronym braced; the style decides the
% final casing.
@ARTICLE{RWS:Challenge06,
  AUTHOR    = {Lud{\"a}scher, Bertram and
               Podhorszki, Norbert and
               Altintas, Ilkay and
               Bowers, Shawn and
               McPhillips, Timothy M.},
  TITLE     = {From Computation Models to Models of Provenance: The {RWS} Approach},
  JOURNAL   = {Concurrency and Computation: Practice and Experience},
  YEAR      = {2008},
  OPTKEY    = {},
  VOLUME    = {20},
  NUMBER    = {5},
  OPTPAGES  = {},
  OPTMONTH  = {},
  OPTNOTE   = {},
  OPTANNOTE = {},
  ABSTRACT  = {Scientific workflows often either require or can take benefit from
the use of complex modeling constructs such as sub-workflow
nesting, cycles for executing loops, and pipelined
execution. However, for such workflows, it is not obvious how to
capture, represent, and query associated provenance information.
The Kepler Provenance Recorder provides an extensible framework for
capturing provenance information in actor-oriented scientific
workflows. The primary application of the Provenance Recorder so
far, however, has been for implementing ``smart'' re-run, where
previous workflow runs are reused to optimize future runs with
different input data or parameter settings. Alternatively, the
Read, Write, State-reset (RWS) provenance model is designed to
capture, and subsequently query, detailed data and invocation
dependencies in scientific workflows. The RWS model is designed to
explicitly support scientific workflows using pipeline parallelism
over streaming data as well as cycles. This paper describes an
implementation of the RWS model within Kepler, including the
required extensions to the Kepler Provenance Recorder. We also
describe additional extensions to the RWS model for capturing
nested workflows, in which workflow components (actors) represent
sub-workflows. Finally, we present examples from the provenance
challenge that highlight the capabilities of our approach.}
}
% ZOOM: a provenance model for scientific workflows built around user views
% over composite step-classes (sub-workflows).
% Title is stored in Title Case with the acronym braced; the style decides the
% final casing.
@ARTICLE{Zoom:Challenge06,
  AUTHOR    = {Cohen-Boulakia, Sarah and
               Biton, Olivier and
               Cohen, Shirley and
               Davidson, Susan},
  TITLE     = {Addressing the Provenance Challenge Using {ZOOM}},
  JOURNAL   = {Concurrency and Computation: Practice and Experience},
  YEAR      = {2008},
  OPTKEY    = {},
  VOLUME    = {20},
  NUMBER    = {5},
  OPTPAGES  = {},
  OPTMONTH  = {},
  OPTNOTE   = {},
  OPTANNOTE = {},
  ABSTRACT  = {ZOOM*UserViews presents a formal model of provenance for scientific workflows that is simple, generic, and yet sufficiently expressive to answer questions of data and step provenance
that have been encountered in a large variety of scientific case studies.
In addition, ZOOM builds on the concept of composite step-classes -- or
sub-workflows -- which is present in many scientific workflow systems
to develop a notion of user views. This paper discusses the design and
implementation of ZOOM in the context of the queries posed by the
provenance challenge, and shows how user views affect the level of
granularity at which provenance information can be seen and reasoned about.}
}
% Querying in the Karma Provenance Framework used in the LEAD Project.
% "Karma" is braced so sentence-casing styles keep the system name capitalised.
@ARTICLE{Karma:Challenge06,
  AUTHOR    = {Simmhan, Yogesh L. and
               Plale, Beth and
               Gannon, Dennis},
  TITLE     = {Querying Capabilities of the {Karma} Provenance Framework},
  JOURNAL   = {Concurrency and Computation: Practice and Experience},
  YEAR      = {2008},
  OPTKEY    = {},
  VOLUME    = {20},
  NUMBER    = {5},
  PAGES     = {441--451},
  DOI       = {10.1002/cpe.1229},
  OPTMONTH  = {},
  OPTNOTE   = {},
  OPTANNOTE = {},
  ABSTRACT  = {Provenance in e-Science is a form of metadata capturing the
derivation history of data products and scientific
workflows. Provenance forms a glue linking workflow executions with
associated data products, and finds use in determining the quality
of derived data, tracking resource usage, verifying and validating
scientific experiments, and for different forms of information
discovery through querying and mining. In this article, we discuss
the scope of provenance collected in the Karma Provenance Framework
used in the LEAD Project, distinguishing provenance metadata from
generic annotations. We further describe our approaches to querying
for different kinds of provenance in Karma while addressing the
queries proposed in the Provenance Challenge Workshop. We use a
building-block method to construct provenance queries, with the
Karma service providing fundamental querying capabilities centered
on the provenance metadata model and client-side libraries using
those to iteratively perform complex queries. This has the
advantage of keeping the Karma service generic and simple, and yet
supports a wide range of queries. We conclude with opportunities
that we see for optimizing the Karma query interface to tackle
potentially costly deep provenance queries.}
}
% Semantic Web approach to the challenge: web services emit RDF provenance
% that is queried with SPARQL.
% NOTE(review): the abstract sentence beginning "When working in an online
% environment" and the final sentence ("contrasts with systems.") read as if
% words are missing -- check against the published abstract.
@ARTICLE{MINDSWAP:Challenge06,
  AUTHOR    = {Golbeck, Jennifer and
               Hendler, James},
  TITLE     = {A {Semantic Web} Approach to Tracking Provenance in Scientific Workflows},
  JOURNAL   = {Concurrency and Computation: Practice and Experience},
  YEAR      = {2008},
  OPTKEY    = {},
  VOLUME    = {20},
  NUMBER    = {5},
  OPTPAGES  = {},
  OPTMONTH  = {},
  OPTNOTE   = {},
  OPTANNOTE = {},
  ABSTRACT  = {Provenance is a critical concept in scientific workflows, since it
allows scientists to understand the origin of their results, to
repeat their experiments, and to validate the processes that were
used to derive data products. When working in an online environment,
such as the Semantic Web is a natural fit for executing workflows
and producing provenance information for the process and files. In
this paper, we present our Semantic Web-based approach to the
provenance challenge. Web services execute each step of the
workflow and output files onto the web where they are represented
uniquely by URIs. The services also output RDF files that represent
metadata about their execution as well as the provenance of the
output files. When these files are aggregated, simple SPARQL can
answer all the queries in the challenge. We will also discuss how
this distributed approach contrasts with systems.}
}
% Job Provenance (JP), a job-centric Grid service from the gLite middleware
% of the EU EGEE project.
% "gLite" is braced so styles neither capitalise nor lowercase the product name.
@ARTICLE{JP:Challenge06,
  AUTHOR    = {Krenek, Ales and
               Sitera, Jiri and
               Matyska, Ludek and
               Dvorak, Frantisek and
               Mulac, Milos and
               Ruda, Miroslav and
               Salvet, Zdenek},
  TITLE     = {{gLite} Job Provenance -- a Job-Centric View},
  JOURNAL   = {Concurrency and Computation: Practice and Experience},
  YEAR      = {2008},
  OPTKEY    = {},
  VOLUME    = {20},
  NUMBER    = {5},
  OPTPAGES  = {},
  OPTMONTH  = {},
  OPTNOTE   = {},
  OPTANNOTE = {},
  ABSTRACT  = {Job Provenance (JP) is a Grid service that keeps long-term track on completed
computations for further reference. It is a job-centric service, keeping
records about job life cycle, environment, inputs/outputs, user parameters
etc. The data collected from the Grid middleware where the job has run can
be complemented with user annotations that add a personalized view.
JP is a part of the gLite Grid middleware developed within the EU EGEE project.
During the first provenance challenge, we explored the relation between a
specific job-centric Grid oriented provenance and a more general data
provenance approach. We demonstrated how the job-centric view of computations
can be connected with data-centric user queries. We present important design
decisions and user-level procedures used in the challenge to implement
individual prescribed scenarios. We also show how JP can store data about
complex workflows and how these data can be used to answer user queries.
The implementation of the first provenance challenge workflow in a real
production level Grid system (gLite based EGEE Grid) provides an insight
how the workflow tasks can be implemented and run on a Grid.
We conclude with ``lessons learnt'' -- the challenge represents a usecase with
emphasis in fields that were not priorities in the original JP design,
namely dealing with structured computations (workflows), and types of
annotations which are logically related to data rather than jobs.
However, we proved that the design is sufficiently general to cope with
this usage approach. We also identified several areas where it is feasible
to extend the current implementation.}
}
% Open provenance architecture (OPA) approach: process documentation is
% recorded, and provenance is obtained by querying it.
% The title is no longer wrapped in a second pair of braces, so bibliography
% styles can apply their own casing.
@ARTICLE{OPA:Challenge06,
  AUTHOR    = {Miles, Simon and
               Groth, Paul and
               Munroe, Steve and
               Jiang, Sheng and
               Assandri, Thibaut and
               Moreau, Luc},
  TITLE     = {Extracting Causal Graphs from an Open Provenance Data Model},
  JOURNAL   = {Concurrency and Computation: Practice and Experience},
  YEAR      = {2008},
  EUPUB     = {yes},
  OPTKEY    = {},
  VOLUME    = {20},
  NUMBER    = {5},
  OPTPAGES  = {},
  OPTMONTH  = {},
  OPTNOTE   = {},
  OPTANNOTE = {},
  ABSTRACT  = {The open provenance architecture (OPA) approach to the challenge
was distinct in several regards. In particular, it is based on an
open, well-defined data model and architecture, allowing different
components of the challenge workflow to independently record
documentation, and for the workflow to be executed in any
environment. Another noticeable feature is that we distinguish
between the data recorded about what has occurred, \emph{process
documentation}, and the \emph{provenance} of a data item, which is
all that caused the data item to be as it is and is obtained as the
result of a query over process documentation. This distinction
allows us to tailor the system to separately best address the
requirements of recording and querying documentation. Other
notable features include the explicit recording of causal
relationships between both events and data items, an
interaction-based world model, intensional definition of data items
in queries rather than relying on explicit naming mechanisms, and
\emph{styling} of documentation to support non-functional
application requirements such as reducing storage costs or ensuring
privacy of data. In this paper we describe how each of these
features aid us in answering the challenge provenance queries.}
}
% Earth System Science Server (ES3): provenance extracted automatically by
% monitoring applications' interactions with their execution environment.
% The abstract is kept pure ASCII (plain apostrophe) to match the rest of the file.
@ARTICLE{ES3:Challenge06,
  AUTHOR    = {Frew, James and
               Metzger, Dominic and
               Slaughter, Peter},
  TITLE     = {Automatic Capture and Reconstruction of Computational Provenance},
  JOURNAL   = {Concurrency and Computation: Practice and Experience},
  YEAR      = {2008},
  OPTKEY    = {},
  VOLUME    = {20},
  NUMBER    = {5},
  OPTPAGES  = {},
  OPTMONTH  = {},
  OPTNOTE   = {},
  OPTANNOTE = {},
  ABSTRACT  = {The Earth System Science Server (ES3) project is developing a local infrastructure for managing Earth science data products derived from satellite remote sensing. By ``local,'' we mean the infrastructure that a scientist uses to manage the creation and dissemination of her own data products, particularly those that are constantly incorporating corrections or improvements based on the scientist's own research. Therefore, in addition to being robust and capacious enough to support public access, ES3 is intended to be flexible enough to manage the idiosyncratic computing ensembles that typify scientific research.
Instead of specifying provenance explicitly with a workflow model, ES3 extracts provenance information automatically from arbitrary applications by monitoring their interactions with their execution environment. These interactions (arguments, file I/O, system calls, etc.) are logged to the ES3 database, which assembles them into provenance graphs. These graphs resemble workflow specifications, but are really reports -- they describe what actually happened, as opposed to what was requested. The ES3 database supports forward and backward navigation through provenance graphs (i.e. ancestor/descendant queries), as well as graph retrieval.}
}
% Virtual Data Language (VDL) and its runtime environment applied to the
% provenance challenge.
% The author list must not end in punctuation: a trailing period would become
% part of the last author's surname.
@ARTICLE{VDL:Challenge06,
  AUTHOR    = {Clifford, Ben and
               Foster, Ian and
               Hategan, Mihael and
               Stef-Praun, Tiberiu and
               Wilde, Michael and
               Zhao, Yong},
  TITLE     = {Tracking Provenance in a Virtual Data Grid},
  JOURNAL   = {Concurrency and Computation: Practice and Experience},
  YEAR      = {2008},
  OPTKEY    = {},
  VOLUME    = {20},
  NUMBER    = {5},
  OPTPAGES  = {},
  OPTMONTH  = {},
  OPTNOTE   = {},
  OPTANNOTE = {},
  ABSTRACT  = {The virtual data model allows data sets to be described prior to,
and separate from, their physical materialization. Virtual data
products are described by the three dimensions of the workflow that
must be performed to materialize data sets, the runtime logs
produced by the execution of these workflows, and the metadata
annotations that permit application semantics to be associated with
the data. This model is implemented by a Virtual Data Language
(VDL) and its supporting processing tools and runtime
environment. The VDL environment enables the work of deriving data
products to be spread over a global Grid of storage and processing
services, and uses both XML and relational data models to capture
and query annotation, workflow and provenance in this widely
distributed environment. This paper describes the implementation
and data modeling aspects of these mechanisms in the context of a
standardized data provenance challenge exercise.}
}
% Integrating RDF process and content descriptions from the D2K and
% CyberIntegrator execution environments in the Kowari RDF database.
@ARTICLE{NCSA:Challenge06,
  AUTHOR    = {Futrelle, Joe and
               Myers, Jim},
  TITLE     = {Tracking Provenance Semantics in Heterogeneous Execution Systems},
  JOURNAL   = {Concurrency and Computation: Practice and Experience},
  YEAR      = {2008},
  OPTKEY    = {},
  VOLUME    = {20},
  NUMBER    = {5},
  OPTPAGES  = {},
  OPTMONTH  = {},
  OPTNOTE   = {},
  OPTANNOTE = {},
  ABSTRACT  = {Digital artifacts result from complex, heterogeneous work processes involving content management, process execution, and curation. Accordingly, systems for tracking provenance of digital artifacts need to be able to integrate heterogeneous descriptions produced by loosely-coupled or independent software components and work processes. In the approach described in this paper, two independently-developed execution environments, D2K and CyberIntegrator, were instrumented by their developers to produce process and content descriptions in the form of Resource Description Framework (RDF) statements. Using the open-source Kowari RDF database, these heterogeneous semantic descriptions were integrated to demonstrate the general applicability of RDF databases to answering provenance-related queries. The results suggest that the ``open-world'' semantic model provided by RDF, and the powerful query languages provided by RDF databases, can be extended to integrate a wide variety of heterogeneous provenance-related information with minimal investment in new standard API's, metadata formats, and execution environments.}
}
% Provenance Aware Storage Systems (PASS): prototype overview and the query
% model developed for the challenge.
% "PASS-ing" is braced so sentence-casing styles do not lowercase the acronym.
@ARTICLE{PASS:Challenge06,
  AUTHOR    = {Seltzer, Margo and
               Holland, David A. and
               Braun, Uri and
               Muniswamy-Reddy, Kiran-Kumar},
  TITLE     = {{PASS-ing} the Provenance Challenge},
  JOURNAL   = {Concurrency and Computation: Practice and Experience},
  YEAR      = {2008},
  OPTKEY    = {},
  VOLUME    = {20},
  NUMBER    = {5},
  OPTPAGES  = {},
  OPTMONTH  = {},
  OPTNOTE   = {},
  OPTANNOTE = {},
  ABSTRACT  = {Provenance Aware Storage Systems (PASS) are a new class of storage
system treating provenance as a first-class object, providing
automatic collection, storage, and management of provenance as well
as query capabilities. We developed the first PASS prototype
between 2005 and 2006, targeting scientific end-users. Prior to
undertaking the Provenance Challenge, we had focused on provenance
collection and storage, without much emphasis on a query model or
language. The challenge forced us to (quickly) develop a query
model and infrastructure implementing this model. We present a
brief overview of the PASS prototype and a discussion of the
evolution of the query model that we developed for the challenge.}
}
% VisTrails: change-based provenance organised in layers, and a first
% approach to querying it for the challenge.
@ARTICLE{Vistrail:Challenge06,
  AUTHOR    = {Scheidegger, Carlos and
               Koop, David and
               Santos, Emanuele and
               Vo, Huy and
               Callahan, Steven and
               Freire, Juliana and
               Silva, Claudio},
  TITLE     = {Tackling the Provenance Challenge One Layer at a Time},
  JOURNAL   = {Concurrency and Computation: Practice and Experience},
  YEAR      = {2008},
  OPTKEY    = {},
  VOLUME    = {20},
  NUMBER    = {5},
  OPTPAGES  = {},
  OPTMONTH  = {},
  OPTNOTE   = {},
  OPTANNOTE = {},
  ABSTRACT  = {VisTrails is a new workflow management system that provides support
for scientific data exploration and visualization. Whereas workflows
have been traditionally used to automate repetitive tasks, for
applications that are exploratory in nature, very little is
repeated---change is the norm. VisTrails uses a new change-based
provenance mechanism which was designed to manage rapidly-evolving
workflows. It uniformly and automatically captures provenance
information for data products and for the evolution of the workflows
used to generate these products. In this paper, we describe how the
provenance data is organized in layers and present a first
approach to querying these data that we developed to tackle the
Provenance Challenge queries.}
}
% Provenance generated by Wings during workflow instantiation and by the
% Pegasus mapping system during refinement.
% Title is stored in Title Case with the system names braced; the style
% decides the final casing.
@ARTICLE{Wings+Pegasus:Challenge06,
  AUTHOR    = {Kim, Jihie and
               Deelman, Ewa and
               Gil, Yolanda and
               Mehta, Gaurang and
               Ratnakar, Varun},
  TITLE     = {Provenance Trails in the {Wings/Pegasus} System},
  JOURNAL   = {Concurrency and Computation: Practice and Experience},
  YEAR      = {2008},
  OPTKEY    = {},
  VOLUME    = {20},
  NUMBER    = {5},
  OPTPAGES  = {},
  OPTMONTH  = {},
  OPTNOTE   = {},
  OPTANNOTE = {},
  ABSTRACT  = {Creation of valid scientific workflows involves keeping track of
various workflow constraints, including data independent
constraints on workflow components, data driven constraints, and
resource management constraints. We describe an approach to
workflow instantiation and refinement that uses semantic
representations of workflow constraints to 1) describe complex
scientific applications in a data-independent manner, then 2)
automatically generates workflows of computations for given data
sets, and 3) finally maps them to available computing resources. We
illustrate the provenance data generated by Wings during workflow
instantiation and the refinement provenance by the Pegasus mapping
system for execution over grid computing environments. We show how
the results are mapped to the queries of the Provenance
Challenge.}
}
% Taverna (myGrid project) provenance represented with Semantic Web
% technologies and mined through the ProQA framework.
% Proper nouns in the title are braced so sentence-casing styles keep their
% capitals.
@ARTICLE{myGrid:Challenge06,
  AUTHOR    = {Zhao, Jun and
               Goble, Carole and
               Stevens, Robert and
               Turi, Daniele},
  TITLE     = {Mining {Taverna's} {Semantic Web} of Provenance},
  JOURNAL   = {Concurrency and Computation: Practice and Experience},
  YEAR      = {2008},
  OPTKEY    = {},
  VOLUME    = {20},
  NUMBER    = {5},
  OPTPAGES  = {},
  OPTMONTH  = {},
  OPTNOTE   = {},
  OPTANNOTE = {},
  ABSTRACT  = {Taverna is a workflow workbench and execution environment developed as part of the UK's myGrid project. Taverna's provenance model captures: information about the origin of experimental data results collected during workflow runs; derivation paths that present a datum's lineage; an audit trail of the experiment execution leading to the data; the context of the workflow; and the evidence of the knowledge outcomes as a result of its execution. Flexible and open models are required to cater for an accumulative body of knowledge as workflows are multiply re-run, and as the same data are gathered from external repositories by different workflows. Hence we adopt the RDF graph-based data model formalism. The provenance graphs generated by workflow runs are semantically enriched with descriptions about the workflows and their products, captured during workflow design, execution, interpretation and publication. This enables context-based analysis combining origin and domain knowledge about these experimental entities. Previous work has shown how Taverna's provenance is represented using Semantic Web technologies, combining external third-party metadata with semantic annotations capturing the signatures of workflow components, leading to a ``Semantic Web of Provenance''. This paper shows how this Semantic Web of Provenance can be mined by a 5-tiered provenance usage framework, ProQA (Provenance Query and Answer). ProQA supports a wide range of provenance operations, from fine-grained provenance retrieval, to high-level provenance analysis and reasoning for supporting a collection of user scenarios. This framework is implemented as the ProQA query API, and a set of system provenance workflows that analyze experiment results using the provenance records.
These provenance workflows are consistent with the experiment practice of Taverna's users, and enable the provenance of the data analysis and interpretation process to be automatically collected during the runs of these workflows. We show how these features of Taverna's provenance support us in answering the questions from the provenance challenge workshop and a set of additional provenance queries.}
}
This file has been generated by
bibtex2html 1.52