[Spark] Pipeline Setup Code Summary

Spark Bigdata Pipeline

  • This is the code our instructor put together. It should come in handy when the virtual machine gets wiped and everything has to be rebuilt and reinstalled.

  • Environment: Ubuntu 20

  • Account: big / 1234

  • For detailed installation steps, see the [Spark] review posts on this blog.

sudo apt update
sudo apt upgrade

sudo apt install vim -y
sudo apt install openssh-server ssh-askpass -y

ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys

wget https://corretto.aws/downloads/latest/amazon-corretto-11-x64-linux-jdk.tar.gz
tar xvzf amazon-corretto-11-x64-linux-jdk.tar.gz
# the extracted directory name follows whatever Corretto 11 version is current; adjust the link target if it differs
ln -s amazon-corretto-11.0.14.10.1-linux-x64/ java

sudo vim ~/.bashrc

# java
export JAVA_HOME=/home/big/java
export PATH=$PATH:$JAVA_HOME/bin

source ~/.bashrc

java -version
javac -version

python3 --version

sudo add-apt-repository ppa:deadsnakes/ppa
sudo apt update
sudo apt install python3.7 -y
python3 --version

sudo vim ~/.bashrc

# python alias
alias python=python3.7
alias python3=python3.7

source ~/.bashrc
python3 --version
python --version

sudo apt install python3-pip -y

sudo vim ~/.bashrc

# python alias
alias python=python3.7
alias python3=python3.7
alias pip="python3.7 -m pip"

source ~/.bashrc


python -V
pip -V

Hadoop

wget https://mirror.navercorp.com/apache/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz
tar xvzf hadoop-3.3.1.tar.gz
ln -s hadoop-3.3.1 hadoop

sudo vim ~/.bashrc

# hadoop
export HADOOP_HOME=/home/big/hadoop
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin

# hadoop user
export HDFS_NAMENODE_USER=big
export HDFS_DATANODE_USER=big
export HDFS_SECONDARYNAMENODE_USER=big
export YARN_RESOURCEMANAGER_USER=big
export YARN_NODEMANAGER_USER=big

source ~/.bashrc

cd $HADOOP_CONF_DIR
vim hadoop-env.sh

export JAVA_HOME=/home/big/java
export HADOOP_HOME=/home/big/hadoop
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export HADOOP_PID_DIR=$HADOOP_HOME/pids

vim core-site.xml

	<property>
		<name>fs.defaultFS</name>
		<value>hdfs://localhost:9000</value>
	</property>

vim hdfs-site.xml

        <property>
                <name>dfs.replication</name>
                <value>1</value>
        </property>
        <property>
                <name>dfs.namenode.name.dir</name>
                <value>/home/big/hadoop/namenode_dir</value>
        </property>
        <property>
                <name>dfs.namenode.secondary.http-address</name>
                <value>localhost:50090</value>
        </property>
        <property>
                <name>dfs.datanode.data.dir</name>
                <value>/home/big/hadoop/datanode_dir</value>
        </property>

vim mapred-site.xml

        <property>
                <name>mapreduce.framework.name</name>
                <value>yarn</value>
        </property>
# only the NameNode needs formatting; the DataNode directory is created automatically on first start
hdfs namenode -format

# start-all.sh
start-dfs.sh
start-yarn.sh
jps

hdfs dfsadmin -report
# hadoop http -> localhost:9870
# yarn http -> localhost:8088

# stop-all.sh
stop-dfs.sh
stop-yarn.sh

Spark

cd

wget https://downloads.apache.org/spark/spark-3.1.3/spark-3.1.3-bin-without-hadoop.tgz
tar xvzf spark-3.1.3-bin-without-hadoop.tgz
ln -s spark-3.1.3-bin-without-hadoop spark

sudo vim ~/.bashrc

# spark
export SPARK_HOME=/home/big/spark 
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
export SPARK_DIST_CLASSPATH=$($HADOOP_HOME/bin/hadoop classpath)

source ~/.bashrc

cd $SPARK_HOME/conf
cp workers.template workers

cp spark-env.sh.template spark-env.sh
vim spark-env.sh

export JAVA_HOME=/home/big/java
export HADOOP_HOME=/home/big/hadoop
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export YARN_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_DIST_CLASSPATH=$($HADOOP_HOME/bin/hadoop classpath)

export PYSPARK_PYTHON=/usr/bin/python3.7
export PYSPARK_DRIVER_PYTHON=/usr/bin/python3.7

cp spark-defaults.conf.template spark-defaults.conf
vim spark-defaults.conf

spark.master						yarn

pyspark
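
Inside the pyspark shell, a quick sanity check can confirm the session really runs on YARN before exiting (a minimal sketch; sc and spark are the shell's built-in contexts):

# confirm the shell is running against YARN and that a small job executes
print(sc.master)               # expected: yarn
print(spark.range(5).count())  # expected: 5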

exit()

Zeppelin

cd

wget https://mirror.nodesdirect.com/apache/zeppelin/zeppelin-0.10.0/zeppelin-0.10.0-bin-netinst.tgz
tar xvzf zeppelin-0.10.0-bin-netinst.tgz
ln -s zeppelin-0.10.0-bin-netinst zeppelin

sudo vim ~/.bashrc

# zeppelin
export ZEPPELIN_HOME=/home/big/zeppelin
export PATH=$PATH:$ZEPPELIN_HOME/bin

source ~/.bashrc

cd $ZEPPELIN_HOME/conf
cp zeppelin-env.sh.template zeppelin-env.sh
vim zeppelin-env.sh

export JAVA_HOME=/home/big/java
export SPARK_HOME=/home/big/spark
export HADOOP_CONF_DIR=/home/big/hadoop/etc/hadoop

cp zeppelin-site.xml.template zeppelin-site.xml

vim zeppelin-site.xml

<property>
	<name>zeppelin.server.addr</name>
	<value>0.0.0.0</value>
</property>
<property>
	<name>zeppelin.server.port</name>
	<value>8082</value>
</property>
zeppelin-daemon.sh start

# Zeppelin web UI -> localhost:8082

# Spark interpreter settings (set in the Zeppelin [Interpreter] menu)
spark.master				yarn
spark.submit.deployMode		client


# run in a Zeppelin note to check that the Spark interpreter works
test = [1, 2, 3, 4, 5]
testRdd = sc.parallelize(test)
testRdd.collect()
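
As a slightly larger smoke test in the same note, the list can also be turned into a DataFrame and queried with Spark SQL (a minimal sketch; only the standard PySpark API is used):

# build a one-column DataFrame from the same list and query it with SQL
df = spark.createDataFrame([(v,) for v in test], ["value"])
df.createOrReplaceTempView("test_view")
spark.sql("SELECT COUNT(*) AS cnt, SUM(value) AS total FROM test_view").show()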

zeppelin-daemon.sh stop

MySQL

sudo apt install mysql-server -y

sudo service mysql start

sudo mysql_secure_installation
# prompt answers, in order
n       # VALIDATE PASSWORD component?
1234    # new root password
1234    # re-enter root password
y       # remove anonymous users?
n       # disallow root login remotely?
y       # remove test database?
y       # reload privilege tables?

cd /etc/mysql/mysql.conf.d
sudo vim mysqld.cnf

bind-address            = 0.0.0.0
mysqlx-bind-address     = 0.0.0.0

sudo mysql -u root -p
1234
show databases;
use mysql;

alter user 'root'@'localhost' identified with mysql_native_password by '1234';
CREATE USER 'root'@'%' IDENTIFIED BY '1234';

GRANT ALL PRIVILEGES ON *.* TO 'root'@'localhost' WITH GRANT OPTION;
GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' WITH GRANT OPTION;

flush privileges;

# create a test table (in the mysql database selected above, matching the JDBC url used later)
CREATE TABLE test(id int, name VARCHAR(30));

INSERT INTO test VALUES(1, "hadoop");
INSERT INTO test VALUES(2, "spark");

exit

wget https://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java_8.0.26-1ubuntu20.04_all.deb
sudo dpkg -i mysql-connector-java_8.0.26-1ubuntu20.04_all.deb 

cd $SPARK_HOME/conf
vim spark-defaults.conf

spark.jars					/usr/share/java/mysql-connector-java-8.0.26.jar

pyspark

# variables for connecting to MySQL from pyspark
user="root"
password="1234"
url="jdbc:mysql://localhost:3306/mysql"
driver="com.mysql.cj.jdbc.Driver"
dbtable="test"

# connect to the database
test_df = spark.read.format("jdbc").option("user", user).option("password", password)\
.option("url", url).option("driver", driver).option("dbtable", dbtable).load()
## .options(...) can be used instead
# test_df = spark.read.format("jdbc").options(user=user, password=password, url=url, driver=driver, dbtable=dbtable).load()

# check the data loaded from the DB
test_df.show()

# create new data (RDD -> DataFrame)
test_insert = [(3, "mysql"), (4, "zeppelin")]
insert_df = sc.parallelize(test_insert).toDF(["id", "name"])
insert_df.show()

# append (insert) into the table specified above
insert_df.write.jdbc(url, dbtable, "append", properties={"driver": driver, "user": user, "password": password})

# verify the rows were inserted
test_df.show()
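
Spark's JDBC source can also push a SQL query down to MySQL instead of loading a whole table, via the query option; a minimal sketch reusing the connection variables above (the query itself and the query_df name are only illustrations):

# read only the rows matching a pushed-down SQL query ("query" replaces "dbtable")
query = "SELECT id, name FROM test WHERE id >= 3"
query_df = spark.read.format("jdbc").option("user", user).option("password", password)\
.option("url", url).option("driver", driver).option("query", query).load()
query_df.show()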

MongoDB

# 1 MongoDB HomePage - [Resources] - [Documentation] - [Server] - [Installation] - [Ubuntu]
wget -qO - https://www.mongodb.org/static/pgp/server-5.0.asc | sudo apt-key add -
# 2
echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu focal/mongodb-org/5.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-5.0.list
# 3 update packages
sudo apt-get update
sudo apt-get upgrade -y
# 4 install the package
sudo apt install -y mongodb-org
# 5 start the service
sudo systemctl start mongod
# sudo service mongod start

# launch the mongo shell
mongo

# quick test
db.test.insert({"id":"10", "name":"mongo"})
db.test.find()

exit

# if step 5 (starting mongod) fails:
sudo vim /etc/init.d/mongodb
# paste in the contents of https://raw.githubusercontent.com/mongodb/mongo/master/debian/init.d
sudo chmod 755 /etc/init.d/mongodb
sudo service mongodb start

vim spark-defaults.conf

# mongodb connect
spark.mongodb.input.uri		mongodb://localhost/test
spark.mongodb.output.uri	mongodb://localhost/test
# spark.jars.packages		org.mongodb.spark:mongo-spark-connector_2.12:3.0.1

Since the spark.jars.packages setting did not work properly here, pass the dependency with the --packages option when launching pyspark instead:

pyspark --packages org.mongodb.spark:mongo-spark-connector_2.12:3.0.1

test = spark.read.format("mongo").option("database", "test").option("collection", "test").load()
test.show()

insert_df = spark.createDataFrame([("11", "mongo-spark")],["id", "name"])
insert_df.write.format("mongo").option("database", "test").option("collection", "test").mode("append").save()

test = spark.read.format("mongo").option("database", "test").option("collection", "test").load()
test.show()
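
The DataFrame read back from MongoDB is an ordinary Spark DataFrame, so schema inspection and filters work as usual (a minimal sketch against the same test database and collection):

# inspect the inferred schema and filter on the document just written
test.printSchema()
test.filter(test.name == "mongo-spark").select("id", "name").show()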


'''
If the following is added to spark-defaults.conf:
spark.mongodb.input.database	test
spark.mongodb.input.collection	test

spark.mongodb.output.database	test
spark.mongodb.output.collection	test

then .option("database", "test").option("collection", "test") can be omitted:
-> spark.read.format("mongo").load()
'''
