The following 6 code examples, extracted from open-source Python projects, illustrate how to use pyspark.sql.HiveContext().
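Before the extracted examples, here is a minimal, hedged sketch (not taken from any of the projects below) of the typical pattern they all share: build a HiveContext from a SparkContext and use it to run Hive SQL. The table name my_table is a placeholder, not a real table from these projects.

# Minimal sketch (assumed, not from the projects below): create a HiveContext
# from a SparkContext and query a Hive table with SQL.
from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext(appName="HiveContext example")
sqlContext = HiveContext(sc)

# 'my_table' is a placeholder Hive table name.
df = sqlContext.sql("SELECT * FROM my_table LIMIT 10")
df.show()

sc.stop()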
def main(argv=None):
    args = parse_arguments(argv)
    if args['very_verbose']:
        logging.basicConfig(level=logging.DEBUG)
    elif args['verbose']:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig()
    del args['verbose']
    del args['very_verbose']

    sc = SparkContext(appName="MLR: data collection pipeline")
    # spark info logging is incredibly spammy. Use warn to have some hope of
    # human decipherable output
    sc.setLogLevel('WARN')
    sqlContext = HiveContext(sc)

    run_pipeline(sc, sqlContext, **args)
def setUpClass(cls):
    ReusedPySparkTestCase.setUpClass()
    cls.tempdir = tempfile.NamedTemporaryFile(delete=False)
    try:
        cls.sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
    except py4j.protocol.Py4JError:
        cls.tearDownClass()
        raise unittest.SkipTest("Hive is not available")
    except TypeError:
        cls.tearDownClass()
        raise unittest.SkipTest("Hive is not available")
    os.unlink(cls.tempdir.name)
    _scala_HiveContext = \
        cls.sc._jvm.org.apache.spark.sql.hive.test.TestHiveContext(cls.sc._jsc.sc())
    cls.sqlCtx = HiveContext(cls.sc, _scala_HiveContext)
    cls.testData = [Row(key=i, value=str(i)) for i in range(100)]
    cls.df = cls.sc.parallelize(cls.testData).toDF()
def orc(self, path, mode=None, partitionBy=None):
    """Saves the content of the :class:`DataFrame` in ORC format at the specified path.

    ::Note: Currently ORC support is only available together with
    :class:`HiveContext`.

    :param path: the path in any Hadoop supported file system
    :param mode: specifies the behavior of the save operation when data already exists.

        * ``append``: Append contents of this :class:`DataFrame` to existing data.
        * ``overwrite``: Overwrite existing data.
        * ``ignore``: Silently ignore this operation if data already exists.
        * ``error`` (default case): Throw an exception if data already exists.
    :param partitionBy: names of partitioning columns

    >>> orc_df = hiveContext.read.orc('python/test_support/sql/orc_partitioned')
    >>> orc_df.write.orc(os.path.join(tempfile.mkdtemp(), 'data'))
    """
    self.mode(mode)
    if partitionBy is not None:
        self.partitionBy(partitionBy)
    self._jwrite.orc(path)
def main(argv=None):
    args = parse_arguments(argv)
    if args['very_verbose']:
        logging.basicConfig(level=logging.DEBUG)
    elif args['verbose']:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig()
    del args['verbose']
    del args['very_verbose']

    # TODO: Set spark configuration? Some can't actually be set here though, so best might be to set all of it
    # on the command line for consistency.
    sc = SparkContext(appName="MLR: training pipeline")
    sc.setLogLevel('WARN')
    sqlContext = HiveContext(sc)

    output_dir = args['output_dir']
    if os.path.exists(output_dir):
        logging.error('Output directory (%s) already exists' % (output_dir))
        sys.exit(1)

    # Maybe this is a bit early to create the path ... but should be fine.
    # The annoyance might be that an error in training requires deleting
    # this directory to try again.
    os.mkdir(output_dir)

    try:
        run_pipeline(sc, sqlContext, **args)
    except:  # noqa: E722
        # If the directory we created is still empty delete it
        # so it doesn't need to be manually re-created
        if not len(glob.glob(os.path.join(output_dir, '*'))):
            os.rmdir(output_dir)
        raise
def orc(self, path):
    """Loads an ORC file, returning the result as a :class:`DataFrame`.

    ::Note: Currently ORC support is only available together with
    :class:`HiveContext`.

    >>> df = hiveContext.read.orc('python/test_support/sql/orc_partitioned')
    >>> df.dtypes
    [('a', 'bigint'), ('b', 'int'), ('c', 'int')]
    """
    return self._df(self._jreader.orc(path))
def _test():
    import doctest
    import os
    import tempfile
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext, HiveContext
    import pyspark.sql.readwriter

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.readwriter.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['tempfile'] = tempfile
    globs['os'] = os
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['hiveContext'] = HiveContext(sc)
    globs['df'] = globs['sqlContext'].read.parquet('python/test_support/sql/parquet_partitioned')

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.readwriter, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    globs['sc'].stop()
    if failure_count:
        exit(-1)