Serde - JsonSerde

μ„œλ°(SerDe, Serializer/Deserialaizer)λŠ” ν•˜μ΄λΈŒκ°€ 데이터λ₯Ό ν•΄μ„ν•˜λŠ” 방법을 μ œκ³΅ν•œλ‹€. 예λ₯Ό λ“€μ–΄ ν•˜λ‘‘μ— Json νŒŒμΌμ„ λ”°λ‘œ μ „μ²˜λ¦¬ν•˜μ§€ μ•Šκ³  μ„œλ°λ₯Ό μ΄μš©ν•˜μ—¬ λ°”λ‘œ 집어 λ„£μœΌλ©΄ ν•˜μ΄λΈŒμ—μ„œ Row 포맷으둜 λ³€ν™˜ν•˜μ—¬ 읽을 수 있게 λœλ‹€.

ν•˜μ΄λΈŒλŠ” μ„œλ°μ™€ 파일 포맷을 μ΄μš©ν•˜μ—¬ 데이터λ₯Ό 읽고, μ“Έ 수 μžˆλ‹€.

μ•„λž˜λŠ” ν•˜μ΄λΈŒκ°€ μ„œλ°λ₯Ό μ΄μš©ν•˜μ—¬ νŒŒμΌμ„ 읽을 λ•Œμ™€ μ“Έ λ•Œμ˜ μ²˜λ¦¬λ°©λ²•μ΄λ‹€.

  • 파일 읽을 λ•Œ HDFS files -> InputFileFormat -> [key, value] -> Deserializer -> Row object
  • 파일 μ“Έ λ•Œ Row object -> Serializer -> [key, value] -> OutputFileFormat -> HDFS files


ν•˜μ΄λΈŒ κΈ°λ³Έ μ„œλ°

ν•˜μ΄λΈŒμ—μ„œ μ œκ³΅ν•˜λŠ” κΈ°λ³Έ μ„œλ°λŠ” 7가지(Avro, ORC, RegEx, Thrift, Parquet, CSV, JsonSerDe)κ°€ μžˆλ‹€. 각 μ„œλ°λŠ” STORED AS 에 μ§€μ •ν•˜λŠ” 파일의 포맷에 따라 μžλ™μœΌλ‘œ μ„ νƒλœλ‹€.

κ°€μž₯ 많이 μ“°λŠ” λ°˜μ •ν˜• 데이터 ν˜•μ‹μΈ Json을 ν•˜λ‘‘μ— λ„£κ³  μ„œλ°λ₯Ό μ΄μš©ν•˜μ—¬ ν•˜μ΄λΈŒλ‘œ μ‘°νšŒν•΄λ³΄μž.

넣을 λ°μ΄ν„°λŠ” μ•„λž˜μ™€ κ°™λ‹€.

# localdata.json

{
    "result": {
        "header": {
            "columns": [
                {
                    "rowNum": "번호",
                    "opnSfTeamCode": "κ°œλ°©μžμΉ˜λ‹¨μ²΄μ½”λ“œ",
                    "mgtNo": "κ΄€λ¦¬λ²ˆν˜Έ",
                    "opnSvcId": "κ°œλ°©μ„œλΉ„μŠ€ID",
                    "updateGbn": "데이터갱신ꡬ뢄",
                    "updateDt": "λ°μ΄ν„°κ°±μ‹ μΌμž",
                    "opnSvcNm": "κ°œλ°©μ„œλΉ„μŠ€λͺ…",
                    "bplcNm": "사업μž₯λͺ…",
                    "sitePostNo": "μ§€λ²ˆμš°νŽΈλ²ˆν˜Έ",
                    "siteWhlAddr": "μ§€λ²ˆμ£Όμ†Œ",
                    "rdnPostNo": "λ„λ‘œλͺ…μš°νŽΈλ²ˆν˜Έ",
                    "rdnWhlAddr": "λ„λ‘œλͺ…μ£Όμ†Œ",
                    "siteArea": "μ†Œμž¬μ§€λ©΄μ ",
                    "apvPermYmd": "μΈν—ˆκ°€μΌμž",
                    "apvCancelYmd": "μΈν—ˆκ°€μ·¨μ†ŒμΌμž",
                    "dcbYmd": "νμ—…μΌμž",
                    "clgStdt": "νœ΄μ—…μ‹œμž‘μΌμž",
                    "clgEnddt": "νœ΄μ—…μ’…λ£ŒμΌμž",
                    "ropnYmd": "μž¬κ°œμ—…μΌμž",
                    "trdStateGbn": "μ˜μ—…μƒνƒœμ½”λ“œ",
                    "trdStateNm": "μ˜μ—…μƒνƒœλͺ…",
                    "dtlStateGbn": "μƒμ„Έμ˜μ—…μƒνƒœμ½”λ“œ",
                    "dtlStateNm": "μƒμ„Έμ˜μ—…μƒνƒœλͺ…",
                    "x": "μ’Œν‘œμ •λ³΄(X)",
                    "y": "μ’Œν‘œμ •λ³΄(Y)",
                    "lastModTs": "μ΅œμ’…μˆ˜μ •μΌμž",
                    "uptaeNm": "μ—…νƒœκ΅¬λΆ„λͺ…",
                    "siteTel": "μ „ν™”λ²ˆν˜Έ"
                }
            ],
            "paging": {
                "pageIndex": 1,
                "totalCount": 7360,
                "pageSize": 50
            },
            "process": {
                "code": "00",
                "message": "정상 μ²˜λ¦¬λ˜μ—ˆμŠ΅λ‹ˆλ‹€."
            }
        },
        "body": {
            "rows": [
                {
                    "row": [
                        {
                            "rowNum": 1,
                            "opnSfTeamCode": 3740000,
                            "mgtNo": "PHMD120213740032084000004",
                            "opnSvcId": "01_01_06_P",
                            "updateGbn": "I",
                            "updateDt": "2021-07-04 00:22:53.0",
                            "opnSvcNm": "μ•½κ΅­",
                            "bplcNm": "ν•˜λ‚˜μ˜¨λˆ„λ¦¬μ•½κ΅­",
                            "sitePostNo": "",
                            "siteWhlAddr": "경기도 μˆ˜μ›μ‹œ κΆŒμ„ κ΅¬ μ„Έλ₯˜λ™ 1155-9 μ„Έλ₯˜λΉ„μ¦ˆνƒ€μ›Œ",
                            "rdnPostNo": 16660,
                            "rdnWhlAddr": "경기도 μˆ˜μ›μ‹œ κΆŒμ„ κ΅¬ κ²½μˆ˜λŒ€λ‘œ 193, μ„Έλ₯˜λΉ„μ¦ˆνƒ€μ›Œ 1μΈ΅ 105호 (μ„Έλ₯˜λ™)",
                            "siteArea": "",
                            "apvPermYmd": 20210712,
                            "apvCancelYmd": "",
                            "dcbYmd": "",
                            "clgStdt": "",
                            "clgEnddt": "",
                            "ropnYmd": "",
                            "trdStateGbn": "01",
                            "trdStateNm": "μ˜μ—…/정상",
                            "dtlStateGbn": 13,
                            "dtlStateNm": "μ˜μ—…μ€‘",
                            "x": "201454.508790129    ",
                            "y": "415838.18218489     ",
                            "lastModTs": 20210702162433,
                            "uptaeNm": "",
                            "siteTel": ""
                        },
                        {
                            "rowNum": 2,
                            "opnSfTeamCode": 4180000,
                            "mgtNo": 4180000202189,
                            "opnSvcId": "11_50_03_P",
                            "updateGbn": "I",
                            "updateDt": "2021-07-04 00:22:53.0",
                            "opnSvcNm": "행정사업",
                            "bplcNm": "μœ λ„ν˜„ 행정사 μ‚¬λ¬΄μ†Œ",
                            "sitePostNo": "",
                            "siteWhlAddr": "강원도 μΆ˜μ²œμ‹œ 퇴계동 ****",
                            "rdnPostNo": 24380,
                            "rdnWhlAddr": "강원도 μΆ˜μ²œμ‹œ λ‚¨μΆ˜μ²œμƒˆκΈΈ *(퇴계동)",
                            "siteArea": "",
                            "apvPermYmd": 20210708,
                            "apvCancelYmd": "",
                            "dcbYmd": "",
                            "clgStdt": "",
                            "clgEnddt": "",
                            "ropnYmd": "",
                            "trdStateGbn": "01",
                            "trdStateNm": "μ˜μ—…/정상",
                            "dtlStateGbn": "01",
                            "dtlStateNm": "μ˜μ—…μ€‘",
                            "x": "264546.296706142    ",
                            "y": "484486.499826113    ",
                            "lastModTs": 20210702151631,
                            "uptaeNm": "",
                            "siteTel": "033-264-1238"
                        },
                        ...
                    ]
                }
            ]
        }
    }
}

μœ„λŠ” json 데이터λ₯Ό μ •λ ¬μ‹œμΌœ λ†“μ•˜μ§€λ§Œ, hadoop에 μ‹€μ œ λ“€μ–΄κ°€λŠ” json은 μ •λ ¬ν•˜μ§€ 말고 ν•œμ€„λ‘œ λ“€μ–΄κ°€μ•Ό ν•œλ‹€.

json serdeλ₯Ό μ΄μš©ν•œ create문은 μ•„λž˜μ™€ κ°™λ‹€.

create table localdata (
	result struct<
		header: struct<
			columns: array<
				struct<
					rowNum: string,
					opnSfTeamCode: string,
					mgtNo: string,
					opnSvcId: string,
					updateGbn: string,
					updateDt: string,
					opnSvcNm: string,
					bplcNm: string,
					sitePostNo: string,
					siteWhlAddr: string,
					rdnPostNo: string,
					rdnWhlAddr: string,
					siteArea: string,
					apvPermYmd: string,
					apvCancelYmd: string,
					dcbYmd: string,
					clgStdt: string,
					clgEnddt: string,
					ropnYmd: string,
					trdStateGbn: string,
					trdStateNm: string,
					dtlStateGbn: string,
					dtlStateNm: string,
					x: string,
					y: string,
					lastModTs: string,
					uptaeNm: string,
					siteTel: string
				>
			>,
			paging: struct<
				pageIndex: string,
				totalCount: string,
				pageSize: string
			>,
			process: struct<
				code: string,
				message: string
			>
		>,
		body: struct<
		 	`rows`: array<
		 		struct<
		 			`row`: array<
		 				struct<
							rowNum: string,
							opnSfTeamCode: string,
							mgtNo: string,
							opnSvcId: string,
							updateGbn: string,
							updateDt: string,
							opnSvcNm: string,
							bplcNm: string,
							sitePostNo: string,
							siteWhlAddr: string,
							rdnPostNo: string,
							rdnWhlAddr: string,
							siteArea: string,
							apvPermYmd: string,
							apvCancelYmd: string,
							dcbYmd: string,
							clgStdt: string,
							clgEnddt: string,
							ropnYmd: string,
							trdStateGbn: string,
							trdStateNm: string,
							dtlStateGbn: string,
							dtlStateNm: string,
							x: string,
							y: string,
							lastModTs: string,
							uptaeNm: string,
							siteTel: string
						>
					>
		 		>
		 	>
		>
	>
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE
LOCATION
  '/user/hive/st/localdata'
;

ν…Œμ΄λΈ”μ„ μƒμ„±ν–ˆμœΌλ©΄ hadoop에 데이터λ₯Ό λ„£μž.

$ hadoop fs -put localdata.json /user/hive/st/localdata

λ°μ΄ν„°μ—μ„œ body만 쑰회λ₯Ό ν•΄λ³΄μž.

select
	msgarr.rowNum,
	msgarr.opnSfTeamCode,
	msgarr.mgtNo,
	msgarr.opnSvcId,
	msgarr.updateGbn,
	msgarr.updateDt,
	msgarr.opnSvcNm,
	msgarr.bplcNm,
	msgarr.sitePostNo,
	msgarr.siteWhlAddr,
	msgarr.rdnPostNo,
	msgarr.rdnWhlAddr,
	msgarr.siteArea,
	msgarr.apvPermYmd,
	msgarr.apvCancelYmd,
	msgarr.dcbYmd,
	msgarr.clgStdt,
	msgarr.clgEnddt,
	msgarr.ropnYmd,
	msgarr.trdStateGbn,
	msgarr.trdStateNm,
	msgarr.dtlStateGbn,
	msgarr.dtlStateNm,
	msgarr.x,
	msgarr.y,
	msgarr.lastModTs,
	msgarr.uptaeNm,
	msgarr.siteTel
from localdata ld lateral view explode(result.body.`rows`.`row`) msgtable as msgarr
;

μ΄λŠ” μ•„λž˜μ™€ 같이 κ²°κ³Όκ°€ λ‚˜μ˜¬ 것이닀.

50개의 데이터가 각 컬럼의 배열에 담겨 ν•œ row둜 λ‚˜μ˜¨λ‹€.

각각의 row둜 데이터λ₯Ό 보고싢을땐 μ•„λž˜μ™€ 같이 ν•˜λ©΄ λœλ‹€.

select
	msgarr2.rowNum,
	msgarr2.opnSfTeamCode,
	msgarr2.mgtNo,
	msgarr2.opnSvcId,
	msgarr2.updateGbn,
	msgarr2.updateDt,
	msgarr2.opnSvcNm,
	msgarr2.bplcNm,
	msgarr2.sitePostNo,
	msgarr2.siteWhlAddr,
	msgarr2.rdnPostNo,
	msgarr2.rdnWhlAddr,
	msgarr2.siteArea,
	msgarr2.apvPermYmd,
	msgarr2.apvCancelYmd,
	msgarr2.dcbYmd,
	msgarr2.clgStdt,
	msgarr2.clgEnddt,
	msgarr2.ropnYmd,
	msgarr2.trdStateGbn,
	msgarr2.trdStateNm,
	msgarr2.dtlStateGbn,
	msgarr2.dtlStateNm,
	msgarr2.x,
	msgarr2.y,
	msgarr2.lastModTs,
	msgarr2.uptaeNm,
	msgarr2.siteTel
from localdata ld lateral view explode(result.body.`rows`.`row`) msgtable as msgarr
                    LATERAL VIEW explode(msgarr) msgtable2 AS msgarr2
;

κ²°κ³ΌλŠ” μ•„λž˜μ™€ κ°™λ‹€.

데이터 총 개수λ₯Ό κ΅¬ν•˜λŠ” 쿼리문은 μ•„λž˜μ™€ κ°™λ‹€.

select
	size(msgarr)
from localdata ld lateral view explode(result.body.`rows`.`row`) msgtable as msgarr
;

Leave a comment