// Basic.scala

package sql

import org.apache.spark.sql.SparkSession
//
// Define data in terms of a case class, convert it to a DataFrame, register
// it as a temporary view, and query it with SQL. Print the original DataFrame
// and the query results, and examine their schemas.
//
object Basic {
  case class Cust(id: Integer, name: String, sales: Double, discount: Double, state: String)

  def main(args: Array[String]): Unit = {
    val spark =
      SparkSession.builder()
        .appName("SQL-Basic")
        .master("local[4]")
        .getOrCreate()
    import spark.implicits._

    // create a sequence of case class objects
    // (we defined the case class above)
    val custs = Seq(
      Cust(1, "Widget Co", 120000.00, 0.00, "AZ"),
      Cust(2, "Acme Widgets", 410500.00, 500.00, "CA"),
      Cust(3, "Widgetry", 410500.00, 200.00, "CA"),
      Cust(4, "Widgets R Us", 410500.00, 0.0, "CA"),
      Cust(5, "Ye Olde Widgete", 500.00, 0.0, "MA")
    )
    // make it an RDD and convert to a DataFrame
    val customerDF = spark.sparkContext.parallelize(custs, 4).toDF()
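    // Going through an RDD is not required: with spark.implicits._ in scope,
    // a local Seq of case class instances converts directly. A sketch of the
    // equivalent one-step conversion (the val name here is ours):
    val customerDirectDF = custs.toDF()
    println("*** Same contents, built without an explicit RDD")
    customerDirectDF.show()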
println("*** See the DataFrame contents")
customerDF.show()
println("*** See the first few lines of the DataFrame contents")
customerDF.show(2)
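    // show() only prints; to get row values back on the driver use take(),
    // which returns an Array[Row] (a minimal sketch)
    customerDF.take(2).foreach(println)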
println("*** Statistics for the numerical columns")
customerDF.describe("sales", "discount").show()
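    // called with no column names, describe() profiles every applicable column
    customerDF.describe().show()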
println("*** A DataFrame has a schema")
customerDF.printSchema()
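    // the schema is also available programmatically as a StructType --
    // a sketch of walking its fields:
    customerDF.schema.fields.foreach(f =>
      println(s"column '${f.name}' has type ${f.dataType}, nullable = ${f.nullable}"))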
    //
    // Register the DataFrame as a temporary view so it can be queried with SQL
    //
    customerDF.createOrReplaceTempView("customer")
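    // the view now appears in the session catalog -- a quick check
    // (spark.catalog is available from Spark 2.0 on):
    spark.catalog.listTables().show()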
println("*** Very simple query")
val allCust = spark.sql("SELECT id, name FROM customer")
allCust.show()
println("*** The result has a schema too")
allCust.printSchema()
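    // the same query can be written without SQL using the DataFrame API --
    // a sketch of the equivalent form (the val name here is ours):
    val allCustDF = customerDF.select("id", "name")
    allCustDF.show()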
    //
    // a more complex query: note how it's spread across multiple lines
    //
    println("*** Very simple query with a filter")
    val californiaCust =
      spark.sql(
        """
          | SELECT id, name, sales
          | FROM customer
          | WHERE state = 'CA'
        """.stripMargin)
    californiaCust.show()
    californiaCust.printSchema()
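    // a sketch of the same filter via column expressions instead of SQL
    // (the $"..." syntax comes from the spark.implicits._ import above):
    val californiaCustDF =
      customerDF.where($"state" === "CA").select("id", "name", "sales")
    californiaCustDF.show()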
println("*** Queries are case sensitive by default, but this can be disabled")
spark.conf.set("spark.sql.caseSensitive", "false")
//
// the capitalization of "CUSTOMER" here would normally make the query fail
// with "Table not found"
//
val caseInsensitive =
spark.sql("SELECT * FROM CUSTOMER")
caseInsensitive.show()
spark.conf.set("spark.sql.caseSensitive", "true")
  }
}