通过 CLI 工具创建集群并进入交互式 shell
调用 aws cli：首先在终端中输入
aws configure 按照提示输入 credentials.csv 中的 Access Key ID 和 Secret Access Key
创建EMR cluster
aws emr create-cluster --name "Spark cluster" --release-label emr-5.16.0 --applications Name=Spark \
--ec2-attributes KeyName=myKey --instance-type m4.large --instance-count 3 --use-default-roles
通过ssh与其连接
aws emr list-clusters 获取 cluster id（用于下一条命令的 --cluster-id 参数）
aws emr ssh --cluster-id j-3SD91U2E1L2QX --key-pair-file ~/.ssh/mykey.pem
pyspark 进入python shell
spark-shell 进入scala shell
scala> sc
res0: org.apache.spark.SparkContext = org.apache.spark.SparkContext@404721db
scala> val textFile = sc.textFile("s3://elasticmapreduce/samples/hive-ads/tables/impressions/dt=2009-04-13-08-05/ec2-0-51-75-39.amazon.com-2009-04-13-08-05.log")
scala> val linesWithCartoonNetwork = textFile.filter(line => line.contains("cartoonne