Usage of first(), show(), take(), collect() and tail()

This article takes a close look at DataFrame/RDD actions such as first(), head(), show(), take(), collect() and tail(): how they are implemented under the hood, how they trigger Spark job execution, and the out-of-memory problems they can run into when processing large datasets.

first() is equivalent to head(1): it returns the first row. head() and take() are backed by the same underlying code path (one simply delegates to the other).
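
To make the relationship concrete, here is a minimal sketch intended for spark-shell; the session setup, the app name and the tiny DataFrame df are all made up for illustration and are reused by the snippets further down.

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("actions-demo")   // hypothetical app name
  .master("local[*]")
  .getOrCreate()
import spark.implicits._

// Tiny hypothetical DataFrame used by all the snippets below
val df = Seq((1, "alpha"), (2, "beta"), (3, "gamma")).toDF("id", "name")

df.first()   // Row(1, "alpha") -- the same row head() returns
df.head(2)   // the first 2 rows as an Array -- the same result as df.take(2)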

show() also builds on take(), but adds an extra layer of type conversion that renders every value as a string, and exposes a truncate parameter that shortens long string values. Note that show() exists only on DataFrame/Dataset; RDD has no show() method.
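
A small illustration of show() and its truncate behaviour, reusing the hypothetical df from the sketch above:

df.show()          // prints up to 20 rows; long string values are truncated to 20 characters
df.show(2, false)  // prints 2 rows without truncating string values
// df.rdd.show()   // does not compile: RDD has no show() method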

take(n) boils down to limit(n) plus collect(). If the n that is passed in is very large, this can cause an OutOfMemoryError, so the method should only be used when the expected result array is small, because all of the returned data is loaded into the driver's memory. In addition, because of complications in the internal implementation, calling it on an RDD of Nothing or Null raises an exception.
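
For illustration, a sketch of a safe take() call versus a risky one (again on the hypothetical df; the huge n is only there to show the failure mode):

val firstTwo = df.take(2)   // fine: only 2 rows are brought back to the driver
firstTwo.foreach(println)
// df.take(100000000)       // risky: with a huge n this is effectively collect() and can OOM the driver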

collect() calls runJob() to trigger the actual task execution. It should likewise only be used when the expected result array is small, because all of the data is loaded into the driver's memory; on a large dataset it will cause an OutOfMemoryError.
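
As a sketch of the "keep the result small" rule, shrink the data first and only then collect; the column name and threshold refer to the made-up df above:

val small = df.filter($"id" >= 2).collect()   // only the filtered rows reach the driver
small.foreach(println)
// df.collect() on a genuinely large DataFrame loads everything into driver memory and risks an OOM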

tail(n) returns the last n rows of a DataFrame. Those rows are also brought back to the driver, so it can cause an OOM as well.
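
Finally, a minimal tail() sketch, assuming Spark 3.0 or later (where Dataset.tail(n) is available) and the same small df:

df.tail(1).foreach(println)   // the last row; all n requested tail rows are returned to the driver, so keep n small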
